
perf(mpsc): rewrite and optimize wait queue (#22)

This branch rewrites the MPSC channel wait queue implementation (again),
in order to improve performance. This undoes a decently large amount of
the perf regression from PR #20.

In particular, I've made the following changes:
* Simplified the design a bit, and reduced the number of CAS loops in
  both the notify and wait paths
* Factored the fast paths (which touch the state variable without
  locking) out of the notify and wait operations into separate
  functions, and marked them `#[inline(always)]`. If the operation
  can't complete without actually touching the linked list, we call
  into a separate `#[inline(never)]` function that locks the list and
  performs the slow path. Code that uses these operations still
  contains a function call, but the few instructions for the CAS are
  inlined, so the call is avoided in many cases. This *significantly*
  improves performance! (See the sketch after this list.)
* Split the `wait` function into `start_wait` (called the first time a
  node waits) and `continue_wait` (called when the node is woken, to
  handle spurious wakeups). This simplifies the code for updating the
  waker, so we no longer have to pass big closures around.
* Other miscellaneous optimizations, such as cache-padding some
  variables that should have been cache-padded.
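
For illustration, here's a minimal sketch of that fast-path/slow-path
split. This is *not* the actual `thingbuf` code (the real queue uses an
intrusive doubly-linked list and more states); the names and the toy
`Vec`-based list here are purely hypothetical:

```rust
use std::sync::{
    atomic::{AtomicUsize, Ordering::SeqCst},
    Mutex,
};

const EMPTY: usize = 0;
const WAKING: usize = 2;

/// Toy wait queue, just to show the shape of the optimization.
struct Queue {
    state: AtomicUsize,
    // Stand-in for the real intrusive list of waiters.
    list: Mutex<Vec<String>>,
}

impl Queue {
    /// Fast path: one CAS on the state variable, cheap enough to inline
    /// into every caller.
    #[inline(always)]
    fn notify(&self) -> bool {
        // No waiters? Store the notification for the next wait attempt.
        if self
            .state
            .compare_exchange(EMPTY, WAKING, SeqCst, SeqCst)
            .is_ok()
        {
            return false;
        }
        // The CAS failed (e.g. a waiter may be queued), so we have to
        // take the out-of-line slow path.
        self.notify_slow()
    }

    /// Slow path: locks the list and dequeues a waiter. `#[cold]` plus
    /// `#[inline(never)]` keep this body out of callers, so they only
    /// pay for the CAS above in the common case.
    #[cold]
    #[inline(never)]
    fn notify_slow(&self) -> bool {
        self.list.lock().unwrap().pop().is_some()
    }
}

fn main() {
    let q = Queue {
        state: AtomicUsize::new(EMPTY),
        list: Mutex::new(Vec::new()),
    };
    // No waiters yet, so the notification is stored on the queue.
    assert!(!q.notify());
}
```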

## Performance Comparison

These benchmarks were run against the current `main` branch
(f77d53475ca16c5a18cc6e4138f96f449064b004).
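
In the Criterion output below, the `change` lines compare each benchmark
against that baseline; `p < 0.05` marks a statistically significant change.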

### async/mpsc_reusable

```
async/mpsc_reusable/ThingBuf/10
                        time:   [43.953 us 44.522 us 45.057 us]
                        change: [+0.0419% +1.7594% +3.5099%] (p = 0.05 < 0.05)
                        Change within noise threshold.
Found 5 outliers among 100 measurements (5.00%)
  1 (1.00%) low severe
  2 (2.00%) low mild
  1 (1.00%) high mild
  1 (1.00%) high severe
async/mpsc_reusable/ThingBuf/50
                        time:   [140.91 us 142.24 us 143.53 us]
                        change: [-31.201% -29.539% -27.824%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
  1 (1.00%) low mild
  1 (1.00%) high mild
async/mpsc_reusable/ThingBuf/100
                        time:   [250.31 us 255.03 us 259.68 us]
                        change: [-18.966% -17.190% -15.202%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high severe

```

### async/mpsc_integer

```
async/mpsc_integer/ThingBuf/10
                        time:   [208.99 us 215.30 us 221.32 us]
                        change: [+0.6957% +3.8603% +6.9400%] (p = 0.02 < 0.05)
                        Change within noise threshold.
async/mpsc_integer/ThingBuf/50
                        time:   [407.46 us 412.74 us 418.31 us]
                        change: [-39.128% -36.567% -33.267%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 13 outliers among 100 measurements (13.00%)
  2 (2.00%) low mild
  4 (4.00%) high mild
  7 (7.00%) high severe
async/mpsc_integer/ThingBuf/100
                        time:   [534.35 us 541.42 us 548.91 us]
                        change: [-44.820% -41.502% -37.120%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 11 outliers among 100 measurements (11.00%)
  1 (1.00%) low mild
  3 (3.00%) high mild
  7 (7.00%) high severe
```

### async/spsc/try_send_reusable

```
async/spsc/try_send_reusable/ThingBuf/100
                        time:   [12.310 us 12.353 us 12.398 us]
                        thrpt:  [8.0656 Melem/s 8.0952 Melem/s 8.1236 Melem/s]
                 change:
                        time:   [-7.5146% -7.1996% -6.8566%] (p = 0.00 < 0.05)
                        thrpt:  [+7.3613% +7.7582% +8.1252%]
                        Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high mild
async/spsc/try_send_reusable/ThingBuf/500
                        time:   [46.691 us 46.778 us 46.871 us]
                        thrpt:  [10.668 Melem/s 10.689 Melem/s 10.709 Melem/s]
                 change:
                        time:   [-9.4767% -9.2760% -9.0811%] (p = 0.00 < 0.05)
                        thrpt:  [+9.9881% +10.224% +10.469%]
                        Performance has improved.
Found 4 outliers among 100 measurements (4.00%)
  4 (4.00%) high mild
async/spsc/try_send_reusable/ThingBuf/1000
                        time:   [89.763 us 90.757 us 91.843 us]
                        thrpt:  [10.888 Melem/s 11.018 Melem/s 11.140 Melem/s]
                 change:
                        time:   [-9.4302% -8.8637% -8.2018%] (p = 0.00 < 0.05)
                        thrpt:  [+8.9346% +9.7257% +10.412%]
                        Performance has improved.
Found 12 outliers among 100 measurements (12.00%)
  1 (1.00%) low mild
  3 (3.00%) high mild
  8 (8.00%) high severe
async/spsc/try_send_reusable/ThingBuf/5000
                        time:   [415.34 us 417.89 us 420.42 us]
                        thrpt:  [11.893 Melem/s 11.965 Melem/s 12.038 Melem/s]
                 change:
                        time:   [-13.113% -12.774% -12.411%] (p = 0.00 < 0.05)
                        thrpt:  [+14.170% +14.644% +15.093%]
                        Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
  7 (7.00%) high mild
async/spsc/try_send_reusable/ThingBuf/10000
                        time:   [847.35 us 848.63 us 849.98 us]
                        thrpt:  [11.765 Melem/s 11.784 Melem/s 11.802 Melem/s]
                 change:
                        time:   [-11.345% -10.820% -10.388%] (p = 0.00 < 0.05)
                        thrpt:  [+11.592% +12.133% +12.796%]
                        Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
  5 (5.00%) low mild
  2 (2.00%) high mild
  1 (1.00%) high severe
```

### async/spsc/try_send_integer

```
async/spsc/try_send_integer/ThingBuf/100
                        time:   [7.2254 us 7.2467 us 7.2690 us]
                        thrpt:  [13.757 Melem/s 13.799 Melem/s 13.840 Melem/s]
                 change:
                        time:   [-13.292% -12.912% -12.520%] (p = 0.00 < 0.05)
                        thrpt:  [+14.312% +14.826% +15.330%]
                        Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high mild
async/spsc/try_send_integer/ThingBuf/500
                        time:   [34.358 us 34.477 us 34.582 us]
                        thrpt:  [14.458 Melem/s 14.503 Melem/s 14.553 Melem/s]
                 change:
                        time:   [-18.539% -18.312% -18.072%] (p = 0.00 < 0.05)
                        thrpt:  [+22.058% +22.417% +22.758%]
                        Performance has improved.
async/spsc/try_send_integer/ThingBuf/1000
                        time:   [69.107 us 69.273 us 69.434 us]
                        thrpt:  [14.402 Melem/s 14.436 Melem/s 14.470 Melem/s]
                 change:
                        time:   [-17.759% -17.604% -17.444%] (p = 0.00 < 0.05)
                        thrpt:  [+21.130% +21.365% +21.594%]
                        Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high mild
async/spsc/try_send_integer/ThingBuf/5000
                        time:   [349.44 us 353.41 us 357.81 us]
                        thrpt:  [13.974 Melem/s 14.148 Melem/s 14.309 Melem/s]
                 change:
                        time:   [-14.832% -14.252% -13.447%] (p = 0.00 < 0.05)
                        thrpt:  [+15.537% +16.621% +17.415%]
                        Performance has improved.
Found 13 outliers among 100 measurements (13.00%)
  5 (5.00%) high mild
  8 (8.00%) high severe
async/spsc/try_send_integer/ThingBuf/10000
                        time:   [712.89 us 732.58 us 754.24 us]
                        thrpt:  [13.258 Melem/s 13.650 Melem/s 14.027 Melem/s]
                 change:
                        time:   [-16.082% -15.161% -14.129%] (p = 0.00 < 0.05)
                        thrpt:  [+16.454% +17.870% +19.164%]
                        Performance has improved.
Found 7 outliers among 100 measurements (7.00%)
  2 (2.00%) high mild
  5 (5.00%) high severe
```

I'm actually not really sure why this also improved the `try_send`
benchmarks, which don't touch the wait queue...but I'll take it!
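
My best guess: this branch also marks `Core::push_ref` and `Core::pop_ref`
(and the `Backoff` spin functions) as `#[inline(always)]`, and those are on
the `try_send` path, so the inlining changes likely help there too.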

Signed-off-by: Eliza Weisman <[email protected]>
Eliza Weisman, 3 years ago
commit 8c882b0f40
11 changed files with 514 additions and 267 deletions:

    Cargo.toml               +1   -0
    bench/Cargo.toml         +5   -1
    src/lib.rs               +3   -1
    src/mpsc.rs              +1   -45
    src/mpsc/async_impl.rs   +62  -50
    src/mpsc/sync.rs         +28  -13
    src/util.rs              +2   -2
    src/util/mutex.rs        +7   -1
    src/wait.rs              +15  -1
    src/wait/cell.rs         +11  -12
    src/wait/queue.rs        +379 -141

+ 1 - 0
Cargo.toml

@@ -18,6 +18,7 @@ default = ["std"]
 
 [dependencies]
 pin-project = "1"
+parking_lot = { version = "0.11", optional = true }
 
 [dev-dependencies]
 tokio = { version = "1.14.0", features = ["rt", "rt-multi-thread", "macros", "sync"] }

+ 5 - 1
bench/Cargo.toml

@@ -7,18 +7,22 @@ publish = false
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [features]
+default = ["parking_lot"]
 # These feature flags can be disabled if we don't want to run comparison
 # benchmarks, such as when just comparing two `thingbuf` versions.
 comparisons = ["crossbeam", "async-std", "futures", "tokio-sync", "std-sync"]
 tokio-sync = ["tokio/sync"]
 std-sync = []
 
+# Use parking_lot mutexes in `thingbuf` 
+parking_lot = ["thingbuf/parking_lot"]
+
 [dependencies]
 thingbuf = { path = ".." }
 criterion = { version = "0.3.5", features = ["async_tokio"] }
 
 # for comparison benchmarks
-tokio = { version = "1.14.0", features = ["rt", "rt-multi-thread", "sync"] }
+tokio = { version = "1.14.0", features = ["rt", "rt-multi-thread", "sync", "parking_lot"] }
 crossbeam = { version = "0.8.1", optional = true }
 async-std = { version = "1", optional = true }
 futures = { version = "0.3", optional = true }

+ 3 - 1
src/lib.rs

@@ -106,7 +106,7 @@ impl Core {
         }
     }
 
-    #[inline]
+    #[inline(always)]
     fn idx_gen(&self, val: usize) -> (usize, usize) {
         (val & self.idx_mask, val & self.gen_mask)
     }
@@ -138,6 +138,7 @@ impl Core {
         test_dbg!(self.tail.fetch_or(self.closed, SeqCst) & self.closed == 0)
     }
 
+    #[inline(always)]
     fn push_ref<'slots, T, S>(
         &self,
         slots: &'slots S,
@@ -226,6 +227,7 @@ impl Core {
         }
     }
 
+    #[inline(always)]
     fn pop_ref<'slots, T, S>(&self, slots: &'slots S) -> Result<Ref<'slots, T>, mpsc::TrySendError>
     where
         S: ops::Index<usize, Output = Slot<T>> + ?Sized,

+ 1 - 45
src/mpsc.rs

@@ -12,8 +12,7 @@
 
 use crate::{
     loom::{atomic::AtomicUsize, hint},
-    util::Backoff,
-    wait::{queue, Notify, WaitCell, WaitQueue, WaitResult},
+    wait::{Notify, WaitCell, WaitQueue, WaitResult},
     Ref, ThingBuf,
 };
 use core::fmt;
@@ -113,49 +112,6 @@ impl<T: Default, N: Notify + Unpin> Inner<T, N> {
         }
     }
 
-    /// Performs one iteration of the `send_ref` loop.
-    ///
-    /// The loop itself has to be written in the actual `send` method's
-    /// implementation, rather than on `inner`, because it might be async and
-    /// may yield, or might park the thread.
-    fn poll_send_ref(
-        &self,
-        node: Pin<&mut queue::Waiter<N>>,
-        mut register: impl FnMut(&mut Option<N>),
-    ) -> Poll<Result<SendRefInner<'_, T, N>, Closed>> {
-        let mut backoff = Backoff::new();
-        let mut node = Some(node);
-        // try to send a few times in a loop, in case the receiver notifies us
-        // right before we park.
-        loop {
-            // try to reserve a send slot, returning if we succeeded or if the
-            // queue was closed.
-            match self.try_send_ref() {
-                Ok(slot) => return Poll::Ready(Ok(slot)),
-                Err(TrySendError::Closed(_)) => return Poll::Ready(Err(Closed(()))),
-                Err(_) => {}
-            }
-
-            // try to push a waiter
-            let pushed_waiter = self.tx_wait.wait(&mut node, &mut register);
-
-            match test_dbg!(pushed_waiter) {
-                WaitResult::Closed => {
-                    // the channel closed while we were registering the waiter!
-                    return Poll::Ready(Err(Closed(())));
-                }
-                WaitResult::Wait => {
-                    // okay, we are now queued to wait. gotosleep!
-                    return Poll::Pending;
-                }
-                WaitResult::Notified => {
-                    // we consumed a queued notification. try again...
-                    backoff.spin_yield();
-                }
-            }
-        }
-    }
-
     /// Performs one iteration of the `recv_ref` loop.
     ///
     /// The loop itself has to be written in the actual `send` method's

+ 62 - 50
src/mpsc/async_impl.rs

@@ -77,66 +77,78 @@ impl<T: Default> Sender<T> {
         #[pin_project::pin_project(PinnedDrop)]
         struct SendRefFuture<'sender, T> {
             tx: &'sender Sender<T>,
-            queued: bool,
+            state: State,
             #[pin]
             waiter: queue::Waiter<Waker>,
         }
 
+        #[derive(Debug, Copy, Clone, Eq, PartialEq)]
+        enum State {
+            Start,
+            Waiting,
+            Done,
+        }
+
         impl<'sender, T: Default + 'sender> Future for SendRefFuture<'sender, T> {
             type Output = Result<SendRef<'sender, T>, Closed>;
 
             fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
                 test_println!("SendRefFuture::poll({:p})", self);
-                // perform one send ref loop iteration
-                let res = {
+
+                loop {
                     let this = self.as_mut().project();
-                    this.tx.inner.poll_send_ref(this.waiter, |waker| {
-                        let my_waker = cx.waker();
-
-                        // If there's already a waker in the node, we might have
-                        // been woken spuriously for some reason. In that case,
-                        // make sure that the waker in the node will wake the
-                        // waker that was passed in on *this* poll --- the
-                        // future may have moved to another task or something!
-                        if let Some(waker) = waker.as_mut() {
-                            if test_dbg!(!waker.will_wake(my_waker)) {
-                                test_println!(
-                                    "poll_send_ref -> re-registering waker {:?}",
-                                    my_waker
-                                );
-                                *waker = my_waker.clone();
+                    let node = this.waiter;
+                    match test_dbg!(*this.state) {
+                        State::Start => {
+                            match this.tx.try_send_ref() {
+                                Ok(slot) => return Poll::Ready(Ok(slot)),
+                                Err(TrySendError::Closed(_)) => {
+                                    return Poll::Ready(Err(Closed(())))
+                                }
+                                Err(_) => {}
                             }
-                            return;
-                        }
 
-                        // Otherwise, we are registering this task for the first
-                        // time.
-                        test_println!("poll_send_ref -> registering initial waker {:?}", my_waker);
-                        *waker = Some(my_waker.clone());
-                        *this.queued = true;
-                    })
-                };
-                res.map(|ready| {
-                    let this = self.as_mut().project();
-                    if test_dbg!(*this.queued) {
-                        // If the node was ever in the queue, we have to make
-                        // sure we're *absolutely certain* it isn't still in the
-                        // queue before we say it's okay to drop the node
-                        // without removing it from the linked list. Check to
-                        // make sure we were woken by the queue, and not by a
-                        // spurious wakeup.
-                        //
-                        // This means we *may* be a little bit aggressive about
-                        // locking the wait queue to make sure the node is
-                        // removed, but that's better than leaving dangling
-                        // pointers in the queue...
-                        *this.queued = test_dbg!(!this
-                            .waiter
-                            .was_woken_from_queue
-                            .swap(false, Ordering::AcqRel));
+                            let start_wait = this.tx.inner.tx_wait.start_wait(node, cx.waker());
+
+                            match test_dbg!(start_wait) {
+                                WaitResult::Closed => {
+                                    // the channel closed while we were registering the waiter!
+                                    *this.state = State::Done;
+                                    return Poll::Ready(Err(Closed(())));
+                                }
+                                WaitResult::Wait => {
+                                    // okay, we are now queued to wait.
+                                    // gotosleep!
+                                    *this.state = State::Waiting;
+                                    return Poll::Pending;
+                                }
+                                WaitResult::Notified => continue,
+                            }
+                        }
+                        State::Waiting => {
+                            let continue_wait =
+                                this.tx.inner.tx_wait.continue_wait(node, cx.waker());
+
+                            match test_dbg!(continue_wait) {
+                                WaitResult::Closed => {
+                                    *this.state = State::Done;
+                                    return Poll::Ready(Err(Closed(())));
+                                }
+                                WaitResult::Wait => return Poll::Pending,
+                                WaitResult::Notified => {
+                                    *this.state = State::Done;
+                                }
+                            }
+                        }
+                        State::Done => match this.tx.try_send_ref() {
+                            Ok(slot) => return Poll::Ready(Ok(slot)),
+                            Err(TrySendError::Closed(_)) => return Poll::Ready(Err(Closed(()))),
+                            Err(_) => {
+                                *this.state = State::Start;
+                            }
+                        },
                     }
-                    ready.map(SendRef)
-                })
+                }
             }
         }
 
@@ -144,8 +156,8 @@ impl<T: Default> Sender<T> {
         impl<T> PinnedDrop for SendRefFuture<'_, T> {
             fn drop(self: Pin<&mut Self>) {
                 test_println!("SendRefFuture::drop({:p})", self);
-                if test_dbg!(self.queued) {
-                    let this = self.project();
+                let this = self.project();
+                if test_dbg!(*this.state) == State::Waiting && test_dbg!(this.waiter.is_linked()) {
                     this.waiter.remove(&this.tx.inner.tx_wait)
                 }
             }
@@ -153,7 +165,7 @@ impl<T: Default> Sender<T> {
 
         SendRefFuture {
             tx: self,
-            queued: false,
+            state: State::Start,
             waiter: queue::Waiter::new(),
         }
         .await

+ 28 - 13
src/mpsc/sync.rs

@@ -55,28 +55,43 @@ impl<T: Default> Sender<T> {
     }
 
     pub fn send_ref(&self) -> Result<SendRef<'_, T>, Closed> {
+        // fast path: avoid getting the thread and constructing the node if the
+        // slot is immediately ready.
+        match self.inner.try_send_ref() {
+            Ok(slot) => return Ok(SendRef(slot)),
+            Err(TrySendError::Closed(_)) => return Err(Closed(())),
+            _ => {}
+        }
+
         let mut waiter = queue::Waiter::new();
+        let mut unqueued = true;
+        let thread = thread::current();
         loop {
-            // perform one send ref loop iteration
-
-            let waiter = unsafe {
+            let node = unsafe {
                 // Safety: in this case, it's totally safe to pin the waiter, as
                 // it is owned uniquely by this function, and it cannot possibly
                 // be moved while this thread is parked.
                 Pin::new_unchecked(&mut waiter)
             };
-            if let Poll::Ready(result) = self.inner.poll_send_ref(waiter, |thread| {
-                if thread.is_none() {
-                    let current = thread::current();
-                    test_println!("registering {:?}", current);
-                    *thread = Some(current);
+
+            let wait = if unqueued {
+                test_dbg!(self.inner.tx_wait.start_wait(node, &thread))
+            } else {
+                test_dbg!(self.inner.tx_wait.continue_wait(node, &thread))
+            };
+
+            match wait {
+                WaitResult::Closed => return Err(Closed(())),
+                WaitResult::Notified => match self.inner.try_send_ref() {
+                    Ok(slot) => return Ok(SendRef(slot)),
+                    Err(TrySendError::Closed(_)) => return Err(Closed(())),
+                    _ => {}
+                },
+                WaitResult::Wait => {
+                    unqueued = false;
+                    thread::park();
                 }
-            }) {
-                return result.map(SendRef);
             }
-
-            // if that iteration failed, park the thread.
-            thread::park();
         }
     }
 

+ 2 - 2
src/util.rs

@@ -28,7 +28,7 @@ impl Backoff {
         Self(0)
     }
 
-    #[inline]
+    #[inline(always)]
     pub(crate) fn spin(&mut self) {
         #[cfg(not(all(loom, test)))]
         for _ in 0..test_dbg!(1 << self.0.min(Self::MAX_SPINS)) {
@@ -46,7 +46,7 @@ impl Backoff {
         }
     }
 
-    #[inline]
+    #[inline(always)]
     pub(crate) fn spin_yield(&mut self) {
         if self.0 <= Self::MAX_SPINS || cfg!(not(any(feature = "std", test))) {
             #[cfg(not(all(loom, test)))]

+ 7 - 1
src/util/mutex.rs

@@ -1,5 +1,5 @@
 feature! {
-    #![feature = "std"]
+    #![all(feature = "std", not(feature = "parking_lot"))]
     pub(crate) use self::std_impl::*;
     mod std_impl;
 }
@@ -9,3 +9,9 @@ pub(crate) use self::spin_impl::*;
 
 #[cfg(any(not(feature = "std"), test))]
 mod spin_impl;
+
+feature! {
+    #![all(feature = "std", feature = "parking_lot")]
+    #[allow(unused_imports)]
+    pub(crate) use parking_lot::{Mutex, MutexGuard};
+}

+ 15 - 1
src/wait.rs

@@ -54,21 +54,35 @@ pub(crate) enum WaitResult {
     Notified,
 }
 
-pub(crate) trait Notify: UnwindSafe + fmt::Debug {
+pub(crate) trait Notify: UnwindSafe + fmt::Debug + Clone {
     fn notify(self);
+
+    fn same(&self, other: &Self) -> bool;
 }
 
 #[cfg(feature = "std")]
 impl Notify for thread::Thread {
+    #[inline]
     fn notify(self) {
         test_println!("NOTIFYING {:?} (from {:?})", self, thread::current());
         self.unpark();
     }
+
+    #[inline]
+    fn same(&self, other: &Self) -> bool {
+        other.id() == self.id()
+    }
 }
 
 impl Notify for Waker {
+    #[inline]
     fn notify(self) {
         test_println!("WAKING TASK {:?} (from {:?})", self, thread::current());
         self.wake();
     }
+
+    #[inline]
+    fn same(&self, other: &Self) -> bool {
+        other.will_wake(self)
+    }
 }

+ 11 - 12
src/wait/cell.rs

@@ -6,7 +6,10 @@ use crate::{
         },
         cell::UnsafeCell,
     },
-    util::panic::{self, RefUnwindSafe, UnwindSafe},
+    util::{
+        panic::{self, RefUnwindSafe, UnwindSafe},
+        CachePadded,
+    },
     wait::{Notify, WaitResult},
 };
 use core::{fmt, ops};
@@ -29,7 +32,7 @@ use core::{fmt, ops};
 ///
 /// [`AtomicWaker`]: https://github.com/tokio-rs/tokio/blob/09b770c5db31a1f35631600e1d239679354da2dd/tokio/src/sync/task/atomic_waker.rs
 pub(crate) struct WaitCell<T> {
-    lock: AtomicUsize,
+    lock: CachePadded<AtomicUsize>,
     waiter: UnsafeCell<Option<T>>,
 }
 
@@ -42,7 +45,7 @@ impl<T> WaitCell<T> {
     #[cfg(not(all(loom, test)))]
     pub(crate) const fn new() -> Self {
         Self {
-            lock: AtomicUsize::new(State::WAITING.0),
+            lock: CachePadded(AtomicUsize::new(State::WAITING.0)),
             waiter: UnsafeCell::new(None),
         }
     }
@@ -50,7 +53,7 @@ impl<T> WaitCell<T> {
     #[cfg(all(loom, test))]
     pub(crate) fn new() -> Self {
         Self {
-            lock: AtomicUsize::new(State::WAITING.0),
+            lock: CachePadded(AtomicUsize::new(State::WAITING.0)),
             waiter: UnsafeCell::new(None),
         }
     }
@@ -140,20 +143,16 @@ impl<T: Notify> WaitCell<T> {
     }
 
     pub(crate) fn notify(&self) -> bool {
-        self.notify2(false)
+        self.notify2(State::WAITING)
     }
 
     pub(crate) fn close_tx(&self) {
-        self.notify2(true);
+        self.notify2(State::TX_CLOSED);
     }
 
-    fn notify2(&self, close: bool) -> bool {
+    fn notify2(&self, close: State) -> bool {
         test_println!("notifying; close={:?};", close);
-        let bits = if close {
-            State::NOTIFYING | State::TX_CLOSED
-        } else {
-            State::NOTIFYING
-        };
+        let bits = State::NOTIFYING | close;
         test_dbg!(bits);
         if test_dbg!(self.fetch_or(bits, AcqRel)) == State::WAITING {
             // we have the lock!

+ 379 - 141
src/wait/queue.rs

@@ -1,6 +1,6 @@
 use crate::{
     loom::{
-        atomic::{AtomicBool, AtomicUsize, Ordering::*},
+        atomic::{AtomicUsize, Ordering::*},
         cell::UnsafeCell,
     },
     util::{mutex::Mutex, CachePadded},
@@ -57,13 +57,27 @@ use core::{fmt, marker::PhantomPinned, pin::Pin, ptr::NonNull};
 /// [2]: https://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue
 #[derive(Debug)]
 pub(crate) struct WaitQueue<T> {
-    /// The wait queue's state variable. The first bit indicates whether the
-    /// queue is closed; the remainder is a counter of notifications assigned to
-    /// the queue because no waiters were currently available to be woken.
+    /// The wait queue's state variable.
     ///
-    /// These stored notifications are "consumed" when new waiters are
-    /// registered; those waiters will be woken immediately rather than being
-    /// enqueued to wait.
+    /// The queue is always in one of the following states:
+    ///
+    /// - `EMPTY`: No waiters are queued, and there is no pending notification.
+    ///   Waiting while the queue is in this state will enqueue the waiter;
+    ///   notifying while in this state will store a pending notification in the
+    ///   queue, transitioning to the `WAKING` state.
+    ///
+    /// - `WAITING`: There are one or more waiters in the queue. Waiting while
+    ///   the queue is in this state will not transition the state. Waking while
+    ///   in this state will wake the first waiter in the queue; if this empties
+    ///   the queue, then the queue will transition to the `EMPTY` state.
+    ///
+    /// - `WAKING`: The queue has a stored notification. Waiting while the queue
+    ///   is in this state will consume the pending notification *without*
+    ///   enqueueing the waiter and transition the queue to the `EMPTY` state.
+    ///   Waking while in this state will leave the queue in this state.
+    ///
+    /// - `CLOSED`: The queue is closed. Waiting while in this state will return
+    ///   [`WaitResult::Closed`] without transitioning the queue's state.
     state: CachePadded<AtomicUsize>,
     /// The linked list of waiters.
     ///
@@ -77,16 +91,44 @@ pub(crate) struct WaitQueue<T> {
     /// modified through the list, so the lock must be held when modifying the
     /// node.
     ///
-    /// A spinlock is used on `no_std` platforms; [`std::sync::Mutex`] is used
-    /// when the standard library is available.
+    /// A spinlock is used on `no_std` platforms; [`std::sync::Mutex`] or
+    /// `parking_lot::Mutex` are used when the standard library is available
+    /// (depending on feature flags).
     list: Mutex<List<T>>,
 }
 
 /// A waiter node which may be linked into a wait queue.
 #[derive(Debug)]
 pub(crate) struct Waiter<T> {
+    /// The waiter's state variable.
+    ///
+    /// A waiter is always in one of the following states:
+    ///
+    /// - `EMPTY`: The waiter is not linked in the queue, and does not have a
+    ///   `Thread`/`Waker`.
+    ///
+    /// - `WAITING`: The waiter is linked in the queue and has a
+    ///   `Thread`/`Waker`.
+    ///
+    /// - `WAKING`: The waiter has been notified by the wait queue. If it is in
+    ///   this state, it is *not* linked into the queue, and does not have a
+    ///   `Thread`/`Waker`.
+    ///
+    /// - `CLOSED`: The waiter has been notified because the wait queue closed.
+    ///   If it is in this state, it is *not* linked into the queue, and does
+    ///   not have a `Thread`/`Waker`.
+    ///
+    /// This may be inspected without holding the lock; it can be used to
+    /// determine whether the lock must be acquired.
+    state: CachePadded<AtomicUsize>,
+
+    /// The linked list node and stored `Thread`/`Waker`.
+    ///
+    /// # Safety
+    ///
+    /// This `UnsafeCell` may only be accessed while holding the `Mutex` around
+    /// the wait queue's linked list!
     node: UnsafeCell<Node<T>>,
-    pub(crate) was_woken_from_queue: AtomicBool,
 }
 
 #[derive(Debug)]
@@ -109,9 +151,10 @@ struct List<T> {
     tail: Link<Waiter<T>>,
 }
 
-const CLOSED: usize = 1;
-const ONE_QUEUED: usize = 2;
 const EMPTY: usize = 0;
+const WAITING: usize = 1;
+const WAKING: usize = 2;
+const CLOSED: usize = 3;
 
 impl<T: Notify + Unpin> WaitQueue<T> {
     pub(crate) fn new() -> Self {
@@ -121,87 +164,186 @@ impl<T: Notify + Unpin> WaitQueue<T> {
         }
     }
 
-    pub(crate) fn wait(
-        &self,
-        waiter: &mut Option<Pin<&mut Waiter<T>>>,
-        register: impl FnOnce(&mut Option<T>),
-    ) -> WaitResult {
-        test_println!("WaitQueue::push_waiter()");
-
-        // First, go ahead and check if the queue has been closed. This is
-        // necessary even if `waiter` is `None`, as the waiter may already be
-        // queued, and just checking if the list was closed.
-        // TODO(eliza): that actually kind of sucks lol...
-        // Is there at least one queued notification assigned to the wait
-        // queue? If so, try to consume that now, rather than waiting.
-        match test_dbg!(self
-            .state
-            .compare_exchange(ONE_QUEUED, EMPTY, AcqRel, Acquire))
-        {
+    /// Start waiting for a notification.
+    ///
+    /// If the queue has a stored notification, this consumes it and returns
+    /// [`WaitResult::Notified`] without adding the waiter to the queue. If the
+    /// queue is closed, this returns [`WaitResult::Closed`] without adding the
+    /// waiter to the queue. Otherwise, the waiter is enqueued, and this returns
+    /// [`WaitResult::Wait`].
+    #[inline(always)]
+    pub(crate) fn start_wait(&self, node: Pin<&mut Waiter<T>>, waiter: &T) -> WaitResult {
+        test_println!("WaitQueue::start_wait({:p})", node);
+
+        // Optimistically, acquire a stored notification before trying to lock
+        // the wait list.
+        match test_dbg!(self.state.compare_exchange(WAKING, EMPTY, SeqCst, SeqCst)) {
             Ok(_) => return WaitResult::Notified,
             Err(CLOSED) => return WaitResult::Closed,
-            Err(_state) => debug_assert_eq!(_state, EMPTY),
+            Err(_) => {}
         }
 
-        // If we were actually called with a real waiter, try to queue the node.
-        if test_dbg!(waiter.is_some()) {
-            // There are no queued notifications to consume, and the queue is
-            // still open. Therefore, it's time to actually push the waiter to
-            // the queue...finally lol :)
+        // Slow path: the queue is not closed, and we failed to consume a stored
+        // notification. We need to acquire the lock and enqueue the waiter.
+        self.start_wait_slow(node, waiter)
+    }
 
-            // Grab the lock...
-            let mut list = self.list.lock();
+    /// Slow path of `start_wait`: acquires the linked list lock, and adds the
+    /// waiter to the queue.
+    #[cold]
+    #[inline(never)]
+    fn start_wait_slow(&self, node: Pin<&mut Waiter<T>>, waiter: &T) -> WaitResult {
+        test_println!("WaitQueue::start_wait_slow({:p})", node);
+        // There are no queued notifications to consume, and the queue is
+        // still open. Therefore, it's time to actually push the waiter to
+        // the queue...finally lol :)
+
+        // Grab the lock...
+        let mut list = self.list.lock();
+        // Reload the queue's state, as it may have changed while we were
+        // waiting to lock the linked list.
+        let mut state = self.state.load(Acquire);
+
+        loop {
+            match test_dbg!(state) {
+                // The queue is empty: transition the state to WAITING, as we
+                // are adding a waiter.
+                EMPTY => {
+                    match test_dbg!(self
+                        .state
+                        .compare_exchange_weak(EMPTY, WAITING, SeqCst, SeqCst))
+                    {
+                        Ok(_) => break,
+                        Err(actual) => {
+                            debug_assert!(actual == WAKING || actual == CLOSED);
+                            state = actual;
+                        }
+                    }
+                }
 
-            // Okay, we have the lock...but what if someone changed the state
-            // WHILE we were waiting to acquire the lock? isn't concurrent
-            // programming great? :) :) :) :) :)
-            // Try to consume a queued notification *again* in case any were
-            // assigned to the queue while we were waiting to acquire the lock.
-            match test_dbg!(self
-                .state
-                .compare_exchange(ONE_QUEUED, EMPTY, AcqRel, Acquire))
-            {
-                Ok(_) => return WaitResult::Notified,
-                Err(CLOSED) => return WaitResult::Closed,
-                Err(_state) => debug_assert_eq!(_state, EMPTY),
+                // The queue was woken while we were waiting to acquire the
+                // lock. Attempt to consume the wakeup.
+                WAKING => {
+                    match test_dbg!(self
+                        .state
+                        .compare_exchange_weak(WAKING, EMPTY, SeqCst, SeqCst))
+                    {
+                        // Consumed the wakeup!
+                        Ok(_) => return WaitResult::Notified,
+                        Err(actual) => {
+                            debug_assert!(actual == EMPTY || actual == CLOSED);
+                            state = actual;
+                        }
+                    }
+                }
+
+                // The queue closed while we were waiting to acquire the lock;
+                // we're done here!
+                CLOSED => return WaitResult::Closed,
+
+                // The queue is already in the WAITING state, so we don't need
+                // to mess with it.
+                _state => {
+                    debug_assert_eq!(_state, WAITING,
+                        "start_wait_slow: unexpected state value {:?} (expected WAITING). this is a bug!",
+                        _state,
+                    );
+                    break;
+                }
+            }
+        }
+
+        // Time to wait! Store the waiter in the node, advance the node's state
+        // to Waiting, and add it to the queue.
+
+        node.with_node(&mut *list, |node| {
+            let _prev = node.waiter.replace(waiter.clone());
+            debug_assert!(
+                _prev.is_none(),
+                "start_wait_slow: called with a node that already had a waiter!"
+            );
+        });
+
+        let _prev_state = test_dbg!(node.state.swap(WAITING, Release));
+        debug_assert!(
+            _prev_state == EMPTY || _prev_state == WAKING,
+            "start_wait_slow: called with a node that was not empty ({}) or woken ({})! actual={}",
+            EMPTY,
+            WAKING,
+            _prev_state,
+        );
+        list.enqueue(node);
+
+        WaitResult::Wait
+    }
+
+    /// Continue waiting for a notification.
+    ///
+    /// This is called when a waiter has been woken. It determines if the
+    /// node was woken from the queue, or if the wakeup was spurious. If the
+    /// wakeup was from the queue, this returns [`WaitResult::Notified`] or
+    /// [`WaitResult::Closed`]. Otherwise, if the wakeup was spurious, this will
+    /// lock the queue and check if the node's waiter needs to be updated.
+    #[inline(always)]
+    pub(crate) fn continue_wait(&self, node: Pin<&mut Waiter<T>>, my_waiter: &T) -> WaitResult {
+        test_println!("WaitQueue::continue_wait({:p})", node);
+
+        // Fast path: check if the node was woken from the queue.
+        let state = test_dbg!(node.state.load(Acquire));
+        match state {
+            WAKING => return WaitResult::Notified,
+            CLOSED => return WaitResult::Closed,
+            _state => {
+                debug_assert_eq!(
+                    _state, WAITING,
+                    "continue_wait should not be called unless the node has been enqueued"
+                );
+            }
+        }
+
+        // Slow path: received a spurious wakeup. We must lock the queue so that
+        // we can potentially modify the node's waiter.
+        self.continue_wait_slow(node, my_waiter)
+    }
+
+    /// Slow path for `continue_wait`: locks the linked list and updates the
+    /// node with a new waiter.
+    #[cold]
+    #[inline(never)]
+    fn continue_wait_slow(&self, node: Pin<&mut Waiter<T>>, my_waiter: &T) -> WaitResult {
+        test_println!("WaitQueue::continue_wait_slow({:p})", node);
+
+        // If the waiting task/thread was woken but no wakeup was assigned to
+        // the node, we may need to update the node with a new waiter.
+        // Therefore, lock the queue in order to modify the node.
+        let mut list = self.list.lock();
+
+        // The node may have been woken while we were waiting to acquire the
+        // lock. If so, check the new state.
+        match test_dbg!(node.state.load(Acquire)) {
+            WAKING => return WaitResult::Notified,
+            CLOSED => return WaitResult::Closed,
+            _state => {
+                debug_assert_eq!(
+                    _state, WAITING,
+                    "continue_wait_slow should not be called unless the node has been enqueued"
+                );
             }
+        }
 
-            // We didn't consume a queued notification. it is now, finally, time
-            // to actually put the waiter in the linked list. wasn't that fun?
-
-            if let Some(waiter) = waiter.take() {
-                // Now that we have the lock, register the `Waker` or `Thread`
-                // to
-                let should_queue = unsafe {
-                    test_println!("WaitQueue::push_waiter -> registering {:p}", waiter);
-                    // Safety: the waker can only be registered while holding
-                    // the wait queue lock. We are holding the lock, so no one
-                    // else will try to touch the waker until we're done.
-                    waiter.with_node(|node| {
-                        // Does the node need to be added to the wait queue? If
-                        // it currently has a waiter (prior to registering),
-                        // then we know it's already in the queue. Otherwise, if
-                        // it doesn't have a waiter, it is either waiting for
-                        // the first time, or it is re-registering after a
-                        // notification that it wasn't able to consume (for some
-                        // reason).
-                        let should_queue = node.waiter.is_none();
-                        register(&mut node.waiter);
-                        should_queue
-                    })
-                };
-                if test_dbg!(should_queue) {
-                    test_println!("WaitQueue::push_waiter -> pushing {:p}", waiter);
-                    test_dbg!(waiter.was_woken_from_queue.swap(false, AcqRel));
-                    list.push_front(waiter);
+        // Okay, we were not woken and need to continue waiting. It may be
+        // necessary to update the node with a new `Thread`/`Waker` (in
+        // practice, this is only necessary in async).
+        node.with_node(&mut *list, |node| {
+            if let Some(ref mut waiter) = node.waiter {
+                if !waiter.same(my_waiter) {
+                    *waiter = my_waiter.clone();
                 }
             } else {
-                // XXX(eliza): in practice we can't ever get here because of the
-                // `if` above. this should probably be `unreachable_unchecked`
-                // but i'm a coward...
-                unreachable!("this could be unchecked...")
+                // XXX(eliza): This branch should _probably_ never occur...
+                node.waiter = Some(my_waiter.clone());
             }
-        }
+        });
 
         WaitResult::Wait
     }
@@ -211,29 +353,82 @@ impl<T: Notify + Unpin> WaitQueue<T> {
     ///
     /// If a waiter was popped from the queue, returns `true`. Otherwise, if the
     /// notification was assigned to the queue, returns `false`.
+    #[inline(always)]
     pub(crate) fn notify(&self) -> bool {
         test_println!("WaitQueue::notify()");
+
+        // Fast path: If the queue is empty, we can simply assign the
+        // notification to the queue.
+        let mut state = self.state.load(Acquire);
+
+        while test_dbg!(state) == WAKING || state == EMPTY {
+            match test_dbg!(self
+                .state
+                .compare_exchange_weak(state, WAKING, SeqCst, SeqCst))
+            {
+                // No waiters are currently waiting, assign the notification to
+                // the queue to be consumed by the next wait attempt.
+                Ok(_) => return false,
+                Err(actual) => state = actual,
+            }
+        }
+
+        // Slow path: there are waiters in the queue, so we must acquire the
+        // lock and wake one of them.
+        self.notify_slow(state)
+    }
+
+    /// Slow path for `notify`: acquire the lock on the linked list, dequeue a
+    /// waiter, and notify it.
+    #[cold]
+    #[inline(never)]
+    fn notify_slow(&self, state: usize) -> bool {
+        test_println!("WaitQueue::notify_slow(state: {})", state);
+
         let mut list = self.list.lock();
-        if let Some(node) = list.pop_back() {
-            drop(list);
-            test_println!("notifying {:?}", node);
-            node.notify();
-            true
-        } else {
-            test_println!("no waiters to notify...");
-            // This can be relaxed because we're holding the lock.
-            test_dbg!(self.state.store(ONE_QUEUED, Relaxed));
-            false
+        match state {
+            EMPTY | WAKING => {
+                if let Err(actual) = self.state.compare_exchange(state, WAKING, SeqCst, SeqCst) {
+                    debug_assert!(actual == EMPTY || actual == WAKING);
+                    self.state.store(WAKING, SeqCst);
+                }
+                false
+            }
+            WAITING => {
+                let waiter = list.dequeue(WAKING);
+                debug_assert!(waiter.is_some(), "if we were in the `WAITING` state, there must be a waiter in the queue!\nself={:#?}", self);
+
+                // If we popped the last node, transition back to the empty
+                // state.
+                if test_dbg!(list.is_empty()) {
+                    self.state.store(EMPTY, SeqCst);
+                }
+
+                // drop the lock
+                drop(list);
+
+                // wake the waiter
+                if let Some(waiter) = waiter {
+                    waiter.notify();
+                    true
+                } else {
+                    false
+                }
+            }
+            weird => unreachable!("notify_slow: unexpected state value {:?}", weird),
         }
     }
 
     /// Close the queue, notifying all waiting tasks.
     pub(crate) fn close(&self) {
         test_println!("WaitQueue::close()");
-        test_dbg!(self.state.store(CLOSED, Release));
+
+        test_dbg!(self.state.store(CLOSED, SeqCst));
         let mut list = self.list.lock();
-        while let Some(node) = list.pop_back() {
-            node.notify();
+        while !list.is_empty() {
+            if let Some(waiter) = list.dequeue(CLOSED) {
+                waiter.notify();
+            }
         }
     }
 }
@@ -243,49 +438,76 @@ impl<T: Notify + Unpin> WaitQueue<T> {
 impl<T: Notify> Waiter<T> {
     pub(crate) fn new() -> Self {
         Self {
+            state: CachePadded(AtomicUsize::new(EMPTY)),
             node: UnsafeCell::new(Node {
                 next: None,
                 prev: None,
                 waiter: None,
                 _pin: PhantomPinned,
             }),
-            was_woken_from_queue: AtomicBool::new(false),
         }
     }
+
+    #[inline(never)]
+    pub(crate) fn remove(self: Pin<&mut Self>, q: &WaitQueue<T>) {
+        test_println!("Waiter::remove({:p})", self);
+        let mut list = q.list.lock();
+        unsafe {
+            // Safety: removing a node is unsafe even when the list is locked,
+            // because there's no way to guarantee that the node is part of
+            // *this* list. However, the potential callers of this method will
+            // never have access to any other linked lists, so we can just kind
+            // of assume that this is safe.
+            list.remove(self);
+        }
+        if test_dbg!(list.is_empty()) {
+            let _ = test_dbg!(q.state.compare_exchange(WAITING, EMPTY, SeqCst, SeqCst));
+        }
+    }
+
+    #[inline]
+    pub(crate) fn is_linked(&self) -> bool {
+        test_dbg!(self.state.load(Acquire)) == WAITING
+    }
 }
 
 impl<T> Waiter<T> {
-    unsafe fn with_node<U>(&self, f: impl FnOnce(&mut Node<T>) -> U) -> U {
-        self.node.with_mut(|node| f(&mut *node))
+    /// # Safety
+    ///
+    /// This is only safe to call while the list is locked. The dummy `_list`
+    /// parameter ensures this method is only called while holding the lock, so
+    /// this can be safe.
+    #[inline(always)]
+    fn with_node<U>(&self, _list: &mut List<T>, f: impl FnOnce(&mut Node<T>) -> U) -> U {
+        self.node.with_mut(|node| unsafe {
+            // Safety: the dummy `_list` argument ensures that the caller has
+            // the right to mutate the list (e.g. the list is locked).
+            f(&mut *node)
+        })
     }
 
+    /// # Safety
+    ///
+    /// This is only safe to call while the list is locked.
     unsafe fn set_prev(&mut self, prev: Option<NonNull<Waiter<T>>>) {
         self.node.with_mut(|node| (*node).prev = prev);
     }
 
+    /// # Safety
+    ///
+    /// This is only safe to call while the list is locked.
     unsafe fn take_prev(&mut self) -> Option<NonNull<Waiter<T>>> {
         self.node.with_mut(|node| (*node).prev.take())
     }
 
+    /// # Safety
+    ///
+    /// This is only safe to call while the list is locked.
     unsafe fn take_next(&mut self) -> Option<NonNull<Waiter<T>>> {
         self.node.with_mut(|node| (*node).next.take())
     }
 }
 
-impl<T: Notify> Waiter<T> {
-    pub(crate) fn remove(self: Pin<&mut Self>, q: &WaitQueue<T>) {
-        test_println!("Waiter::remove({:p})", self);
-        unsafe {
-            // Safety: removing a node is unsafe even when the list is locked,
-            // because there's no way to guarantee that the node is part of
-            // *this* list. However, the potential callers of this method will
-            // never have access to any other linked lists, so we can just kind
-            // of assume that this is safe.
-            q.list.lock().remove(self);
-        }
-    }
-}
-
 unsafe impl<T: Send> Send for Waiter<T> {}
 unsafe impl<T: Send> Sync for Waiter<T> {}
 
@@ -299,18 +521,24 @@ impl<T> List<T> {
         }
     }
 
-    fn push_front(&mut self, waiter: Pin<&mut Waiter<T>>) {
-        unsafe {
-            waiter.with_node(|node| {
-                node.next = self.head;
-                node.prev = None;
-            })
-        }
+    fn enqueue(&mut self, waiter: Pin<&mut Waiter<T>>) {
+        test_println!("List::enqueue({:p})", waiter);
+
+        let node = unsafe { Pin::into_inner_unchecked(waiter) };
+        let ptr = NonNull::from(&*node);
+        debug_assert_ne!(
+            self.head,
+            Some(ptr),
+            "tried to enqueue the same waiter twice!"
+        );
 
-        let ptr = unsafe { NonNull::from(Pin::into_inner_unchecked(waiter)) };
+        let head = self.head.replace(ptr);
+        node.with_node(self, |node| {
+            node.next = head;
+            node.prev = None;
+        });
 
-        debug_assert_ne!(self.head, Some(ptr), "tried to push the same waiter twice!");
-        if let Some(mut head) = self.head.replace(ptr) {
+        if let Some(mut head) = head {
             unsafe {
                 head.as_mut().set_prev(Some(ptr));
             }
@@ -321,36 +549,41 @@ impl<T> List<T> {
         }
     }
 
-    fn pop_back(&mut self) -> Option<T> {
+    fn dequeue(&mut self, new_state: usize) -> Option<T> {
         let mut last = self.tail?;
-        test_println!("List::pop_back() -> {:p}", last);
-
-        unsafe {
-            let last = last.as_mut();
-            let prev = last.take_prev();
+        test_println!("List::dequeue({:?}) -> {:p}", new_state, last);
+
+        let last = unsafe { last.as_mut() };
+        let _prev_state = test_dbg!(last.state.swap(new_state, Release));
+        debug_assert_eq!(_prev_state, WAITING);
+
+        let (prev, waiter) = last.with_node(self, |node| {
+            node.next = None;
+            (node.prev.take(), node.waiter.take())
+        });
+
+        match prev {
+            Some(mut prev) => unsafe {
+                let _ = prev.as_mut().take_next();
+            },
+            None => self.head = None,
+        }
 
-            match prev {
-                Some(mut prev) => {
-                    let _ = prev.as_mut().take_next();
-                }
-                None => self.head = None,
-            }
+        self.tail = prev;
 
-            self.tail = prev;
-            last.take_next();
-            last.was_woken_from_queue.store(true, Relaxed);
-            last.with_node(|node| node.waiter.take())
-        }
+        waiter
     }
 
     unsafe fn remove(&mut self, node: Pin<&mut Waiter<T>>) {
+        test_println!("List::remove({:p})", node);
+
         let node_ref = node.get_unchecked_mut();
         let prev = node_ref.take_prev();
         let next = node_ref.take_next();
         let ptr = NonNull::from(node_ref);
 
         if let Some(mut prev) = prev {
-            prev.as_mut().with_node(|prev| {
+            prev.as_mut().with_node(self, |prev| {
                 debug_assert_eq!(prev.next, Some(ptr));
                 prev.next = next;
             });
@@ -359,7 +592,7 @@ impl<T> List<T> {
         }
 
         if let Some(mut next) = next {
-            next.as_mut().with_node(|next| {
+            next.as_mut().with_node(self, |next| {
                 debug_assert_eq!(next.prev, Some(ptr));
                 next.prev = prev;
             });
@@ -367,6 +600,10 @@ impl<T> List<T> {
             self.tail = prev;
         }
     }
+
+    fn is_empty(&self) -> bool {
+        self.head.is_none() && self.tail.is_none()
+    }
 }
 
 impl<T> fmt::Debug for List<T> {
@@ -374,6 +611,7 @@ impl<T> fmt::Debug for List<T> {
         f.debug_struct("List")
             .field("head", &self.head)
             .field("tail", &self.tail)
+            .field("is_empty", &self.is_empty())
             .finish()
     }
 }