# 49_zrtp_cb2_lock_body.patch
#
# 2.17 series. (2.12 equivalent: deps/patches/34_zrtp_cb2_lock_body.patch.)
#
# Follow-up to 47_zrtp_cb_lock_sync.patch. Patch 47 added cb_lock and made
# transport_detach() acquire it, but left transport_rtp_cb2() NOT acquiring
# it. Patch 47's own header predicted the remaining crash:
#
#   "cb2 doesn't take the lock, so the race window between 'cb2 NULL-check
#    passes' and 'cb2 calls stream_rtp_cb2' still exists. Closing that
#    window requires wrapping the cb2 body in cb_lock acquire/release
#    across multiple return paths."
#
# Observed (post-46+47) recurrence under load (sylkserver, 7+ parallel
# calls, fast hangup-during-playback):
#
#   Thread 4 "python3" received signal SIGSEGV, Segmentation fault.
#   #0  on_rx_rtp          at pjmedia/stream.c:1843         (2.12 deploy layout)
#   #1  transport_rtp_cb2  at pjmedia/transport_zrtp.c:934  (post-46 call-site offset)
#   #2  call_rtp_cb        at pjmedia/transport_udp.c:485
#   #3  on_rx_rtp          at pjmedia/transport_udp.c:556
#   #4  ioqueue_dispatch_read_event
#   #5  pj_ioqueue_poll
#   #6  worker_proc        at pjmedia/endpoint.c:349
#
# Root cause -- the window patch 47 left open
# -------------------------------------------
# transport_detach() acquires cb_lock before NULLing the callback
# pointers and returning; after it returns, pjmedia_stream_destroy frees
# the stream pool. But transport_rtp_cb2() never acquires cb_lock, so the
# lock provides NO mutual exclusion between detach and a running callback:
#
#   worker:   enter cb2; NULL-check passes (stream_user_data still valid)
#   worker:   ...preempted before the stream_rtp_cb2() call...
#   teardown: transport_detach(): pj_mutex_lock(cb_lock) succeeds at once
#             (cb2 holds nothing), NULLs pointers, unlocks; then
#             pjmedia_stream_destroy frees the stream pool
#   worker:   resume: cbparam.user_data = zrtp->stream_user_data (now NULL
#             or dangling) -> zrtp->stream_rtp_cb2(&cbparam) -> on_rx_rtp
#             dereferences a freed/NULL stream -> SIGSEGV
#
# A NULL-check (patch 46) cannot close this: between check and use the
# pointer can be NULLed *or* stay non-NULL while the stream it points to
# is freed (use-after-free). 2.17 does not save us either -- on_rx_rtp
# moved to stream_imp_common.c but in this tree it still has no
# grp_lock/NULL guard at entry (it dereferences param->user_data->dec
# immediately), so the ZRTP wrapper must provide the synchronisation.
#
# Fix
# ---
# transport_rtp_cb2() acquires cb_lock for its whole body, with the
# patch-46 NULL-check repeated *inside* the lock (so a detach that
# finished before we acquired the lock is observed), and releases cb_lock
# on every return path: the plaintext/SRTP-forward block return, the
# CRC-mismatch return, the ZRTP_MAGIC-mismatch return, and the
# fall-through at end of function. detach's existing pj_mutex_lock(cb_lock)
# now genuinely blocks until the in-flight callback finishes, so the
# stream pool cannot be freed mid-callback.
#
# Deadlock note: cb_lock is held across the forwarded stream_rtp_cb2()
# (on_rx_rtp) and across ZRTP message processing. This is safe as long as
# on_rx_rtp does not, on the same thread, synchronously call
# transport_detach (which would re-enter the non-recursive cb_lock and
# self-deadlock). On the RX path detach is driven from the hangup/teardown
# thread, not from within on_rx_rtp, so the assumption holds; cb_lock is
# also distinct from zrtpMutex, so ZRTP processing's own locking cannot
# invert the order. Re-verify if a future change tears a stream down from
# inside the RTP receive path.
#
#
--- pjsip_orig/pjmedia/src/pjmedia/transport_zrtp.c
+++ pjsip/pjmedia/src/pjmedia/transport_zrtp.c
@@ -897,6 +897,29 @@
     pjmedia_tp_cb_param cbparam;
  
     /*
+     * zrtp comes from param->user_data (the ZRTP transport itself),
+     * which outlives the stream, so it is safe to read before locking.
+     */
+    if (zrtp == NULL)
+        return;
+
+    /*
+     * CB2 LOCK FIX: hold cb_lock across the whole callback body so
+     * transport_detach() -- which takes the same cb_lock before
+     * clearing the callback pointers and letting pjmedia_stream_destroy
+     * free the stream pool -- cannot run between the NULL-check below
+     * and the zrtp->stream_rtp_cb2() call(s) that dereference the
+     * stream through param->user_data.  In 2.17 on_rx_rtp lives in
+     * stream_imp_common.c and still has no grp_lock / NULL guard of its
+     * own, so the wrapper must provide the synchronisation.  The
+     * NULL-check from patch 46 is repeated *inside* the lock so a detach
+     * that completed before we acquired it is observed.  EVERY return
+     * path below must release cb_lock first.
+     * See deps/patches/2.17/49_zrtp_cb2_lock_body.patch.
+     */
+    pj_mutex_lock(zrtp->cb_lock);
+
+    /*
      * NULL-DEREF FIX: drop the packet if the stream has detached.
      * The slave UDP transport detach is not synchronous with
      * in-flight ioqueue callbacks, so a packet that was already
@@ -906,9 +929,10 @@
      * NULL guard and segfaults on a NULL user_data. See
      * deps/patches/2.17/46_zrtp_detach_race_null_deref.patch.
      */
-    if (zrtp == NULL || zrtp->stream_user_data == NULL ||
-        zrtp->stream_rtp_cb2 == NULL)
+    if (zrtp->stream_user_data == NULL || zrtp->stream_rtp_cb2 == NULL) {
+        pj_mutex_unlock(zrtp->cb_lock);
         return;
+    }
 
     pj_assert(zrtp && zrtp->stream_rtp_cb2 && param->pkt);
  
@@ -956,6 +980,7 @@
         if (!zrtp->started && zrtp->enableZrtp)
             pjmedia_transport_zrtp_startZrtp((pjmedia_transport *)zrtp);
 
+        pj_mutex_unlock(zrtp->cb_lock);
         return;
     }
 
@@ -973,6 +998,7 @@
         {
             if (zrtp->cb.show_message)
                 zrtp->cb.show_message(&zrtp->base, zrtp_Warning, zrtp_WarningCRCmismatch);
+            pj_mutex_unlock(zrtp->cb_lock);
             return;
         }
 
@@ -980,8 +1006,10 @@
         magic = pj_ntohl(magic);
 
         // Check if it is really a ZRTP packet, return, no further processing
-        if (magic != ZRTP_MAGIC)
+        if (magic != ZRTP_MAGIC) {
+            pj_mutex_unlock(zrtp->cb_lock);
             return;
+        }
 
         // cover the case if the other party sends _only_ ZRTP packets at the
         // beginning of a session. Start ZRTP in this case as well.
@@ -998,6 +1026,8 @@
         zrtp->peerSSRC = pj_ntohl(zrtp->peerSSRC);
         zrtp_processZrtpMessage(zrtp->zrtpCtx, zrtpMsg, zrtp->peerSSRC, param->size);
     }
+
+    pj_mutex_unlock(zrtp->cb_lock);
 }
 
 /* This is our RTCP callback, that is called by the slave transport when it
