TCLocks Note
coconutnut

paper repo

Repo structure

.
├── LICENSE
├── README.md
├── scripts
│   ├── run-*-benchmark.sh   // run benchmarks (host)
│   └── run-vm.sh            // run VM (host)
└── src
    ├── defaults.sh          // configure VM cores
    ├── benchmarks
    ├── kernel
    │   ├── linux-5.14.16
    │   └── rcuht
    │       ├── rcuht.c          // lock interface
    │       ├── run-rcuht-*.sh   // run rcuhashbash test (guest, called by benchmarks)
    │       ├── include
    │       │   ├── lib
    │       │   ├── mutex
    │       │   ├── rwlock
    │       │   ├── rwsem
    │       │   └── spinlock
    │       ├── lib
    │       ├── locks            // lock implementation
    │       └── script
    └── userspace

TCLock implementation

Listing 1 in paper

code: komb.h, komb.c

spin_lock

komb_spin_lock(struct qspinlock *lock)
{
    u32 val, cnt;
    struct komb_node *curr_node = NULL;

    // Try to acquire the TAS lock
    val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);
    // expected value: 0, new value: _Q_LOCKED_VAL
    // if lock->val was 0: lock->val becomes _Q_LOCKED_VAL and val = 0
    // if lock->val was not 0: val = the current lock->val, lock unchanged
    if (val == 0)
        // Got the lock: return and execute the critical section
        return;

    curr_node = this_cpu_ptr(&komb_nodes[0]);
    // Begin the slow path
    komb_spin_lock_slowpath(lock); // the stack switch happens in this function
}
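For intuition, here is a minimal userspace sketch of the same TAS fastpath pattern using C11 atomics. The names tas_lock, tas_try_fastpath and Q_LOCKED_VAL are made up for illustration; in the kernel code the compare-exchange role is played by atomic_cmpxchg_acquire().

#include <stdatomic.h>
#include <stdbool.h>

#define Q_LOCKED_VAL 1u                /* stand-in for _Q_LOCKED_VAL */

struct tas_lock {
    atomic_uint val;                   /* 0 = unlocked, Q_LOCKED_VAL = locked */
};

/* Returns true if the fast path took the lock, false if the caller
 * must fall into a slow path (queueing, combining, ...). */
bool tas_try_fastpath(struct tas_lock *lock)
{
    unsigned int expected = 0;

    /* compare-and-swap with acquire ordering on success:
     * only succeeds when the whole lock word is 0 */
    return atomic_compare_exchange_strong_explicit(
        &lock->val, &expected, Q_LOCKED_VAL,
        memory_order_acquire,          /* success: we own the lock */
        memory_order_relaxed);         /* failure: just report the miss */
}

komb_spin_lock does the equivalent check: val == 0 means the CAS succeeded; anything else falls through to komb_spin_lock_slowpath().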

switch_stack

komb_spin_lock_slowpath(struct qspinlock *lock)
{
    register int ret_val;

    /**
     * 26 switch_to_ephemeral_stack(cst.node) # Main → ephemeral stack
     **/

    // Inline assembly:
    // push the callee-saved registers onto the current (main) stack
    asm volatile("pushq %%rbp\n"
                 "pushq %%rbx\n"
                 "pushq %%r12\n"
                 "pushq %%r13\n"
                 "pushq %%r14\n"
                 "pushq %%r15\n"
                 :
                 :
                 // this inline assembly may affect memory
                 : "memory");
    asm volatile(
        // call get_komb_node
        // %P0 is a placeholder for the first input operand and is
        // replaced by the symbol of get_komb_node
        "callq %P0\n"
        // save the stack pointer rsp into komb_node->rsp
        // (%c1 is the constant offsetof(struct komb_node, rsp),
        //  rax holds the pointer returned by get_komb_node)
        "movq %%rsp, %c1(%%rax)\n"
        :
        // input operands bound to %P0 and %c1
        : "i"(get_komb_node), "i"(offsetof(struct komb_node, rsp))
        : "memory");
    asm volatile(
        // call get_shadow_stack_ptr
        "callq %P0\n"
        // dereference the returned address and load it into rsp:
        // from here on we run on the ephemeral (shadow) stack
        "movq (%%rax), %%rsp\n"
        :
        : "i"(get_shadow_stack_ptr)
        : "memory");

    /**
     * 27 lock_slowpath(lock, cst)
     **/
    ret_val = __komb_spin_lock_slowpath(lock);

    /**
     * 28 switch_from_ephemeral_stack(cst.node) # Ephemeral → main stack
     **/
    // ret_val is the return value of __komb_spin_lock_slowpath;
    // Phase 1 returns 0 when the combiner has already executed this
    // waiter's critical section (curr_node->completed)
    if (ret_val) {
        asm volatile("callq %P0\n"
                     "movq (%%rax), %%rsp\n"
                     "popq %%r15\n"
                     "popq %%r14\n"
                     "popq %%r13\n"
                     "popq %%r12\n"
                     "popq %%rbx\n"
                     "popq %%rbp\n"
                     "retq\n"
                     :
                     : "i"(get_shadow_stack_ptr)
                     : "memory");
    } else {
        // store the current rsp into *get_shadow_stack_ptr()
        asm volatile("callq %P0\n"
                     "movq %%rsp, (%%rax)\n"
                     :
                     : "i"(get_shadow_stack_ptr)
                     : "memory");
        // load rsp from komb_node->rsp (back to the main stack)
        asm volatile("callq %P0\n"
                     "movq %c1(%%rax), %%rsp\n"
                     :
                     : "i"(get_komb_node),
                       "i"(offsetof(struct komb_node, rsp))
                     : "memory");
        // pop the saved callee-saved registers and return
        asm volatile("popq %%r15\n"
                     "popq %%r14\n"
                     "popq %%r13\n"
                     "popq %%r12\n"
                     "popq %%rbx\n"
                     "popq %%rbp\n"
                     "retq\n"
                     :
                     :
                     : "memory");
    }
}
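The push / save-rsp / load-rsp / pop / ret pattern above is essentially a coroutine-style stack switch. Below is a self-contained userspace sketch of the same idea for x86-64 SysV (Linux, GCC or Clang). stack_switch, ephemeral_entry and the stack setup are hypothetical illustration code, not the kernel's komb_context_switch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* stack_switch(save_sp, load_sp):
 *   push the callee-saved registers, save rsp to *save_sp,
 *   load rsp from *load_sp, pop the callee-saved registers and
 *   return on the other stack. */
__asm__(
    ".text\n"
    ".globl stack_switch\n"
    "stack_switch:\n"
    "    pushq %rbp\n"
    "    pushq %rbx\n"
    "    pushq %r12\n"
    "    pushq %r13\n"
    "    pushq %r14\n"
    "    pushq %r15\n"
    "    movq %rsp, (%rdi)\n"   /* *save_sp = current rsp */
    "    movq (%rsi), %rsp\n"   /* rsp = *load_sp          */
    "    popq %r15\n"
    "    popq %r14\n"
    "    popq %r13\n"
    "    popq %r12\n"
    "    popq %rbx\n"
    "    popq %rbp\n"
    "    retq\n"
);

void stack_switch(void **save_sp, void **load_sp);

static void *main_sp;
static void *ephemeral_sp;

static void ephemeral_entry(void)
{
    printf("running on the ephemeral stack\n");
    /* jump back to the main stack (like switch_from_ephemeral_stack) */
    stack_switch(&ephemeral_sp, &main_sp);
    /* never reached in this demo */
}

int main(void)
{
    enum { STACK_BYTES = 64 * 1024 };
    uint64_t *mem = malloc(STACK_BYTES);
    uint64_t *sp = (uint64_t *)(((uintptr_t)mem + STACK_BYTES) & ~(uintptr_t)0xf);

    /* Build an initial frame so the first switch "returns" into
     * ephemeral_entry with a SysV-aligned stack: one padding slot,
     * the entry address (consumed by retq), then six zeroed
     * callee-saved slots (rbp, rbx, r12-r15). */
    *--sp = 0;                         /* padding / fake return address */
    *--sp = (uint64_t)ephemeral_entry; /* retq target */
    for (int i = 0; i < 6; i++)
        *--sp = 0;
    ephemeral_sp = sp;

    stack_switch(&main_sp, &ephemeral_sp);   /* main -> ephemeral */
    printf("back on the main stack\n");
    free(mem);
    return 0;
}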

lock_slowpath

__komb_spin_lock_slowpath(struct qspinlock *lock)
{
    struct komb_node *curr_node;
    int tail, idx;

    // Get the per-CPU qnode
    curr_node = this_cpu_ptr(&komb_nodes[0]);
    idx = curr_node->count++;
    tail = encode_tail(smp_processor_id(), idx);

    // Initialize the waiter's qnode
    curr_node->locked = true;
    curr_node->completed = false;
    curr_node->next = NULL;
    curr_node->tail = tail;
    curr_node->socket_id = numa_node_id();
    curr_node->cpuid = smp_processor_id();
    curr_node->irqs_disabled = false;
    curr_node->lock = lock;
    curr_node->task_struct_ptr = current;

    return __komb_spin_lock_longjmp(lock, tail, curr_node);
}
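encode_tail() packs (cpu, idx) into the upper bits of the 32-bit lock word, so a single xchg on the tail identifies the last waiter without storing a pointer. The standalone sketch below mirrors the upstream qspinlock layout (locked bits 0-7, pending bits 8-15, tail idx bits 16-17, tail cpu+1 bits 18-31); komb's exact masks may differ, and the real decode_tail returns the per-CPU node rather than a (cpu, idx) pair.

#include <stdint.h>
#include <stdio.h>

#define Q_TAIL_IDX_OFFSET 16
#define Q_TAIL_CPU_OFFSET 18
#define Q_TAIL_IDX_MASK   (0x3u << Q_TAIL_IDX_OFFSET)
#define Q_TAIL_CPU_MASK   (0x3fffu << Q_TAIL_CPU_OFFSET)
#define Q_TAIL_MASK       (Q_TAIL_IDX_MASK | Q_TAIL_CPU_MASK)

static uint32_t encode_tail(int cpu, int idx)
{
    /* cpu is stored as cpu + 1 so that tail == 0 means "no waiter" */
    return ((uint32_t)(cpu + 1) << Q_TAIL_CPU_OFFSET) |
           ((uint32_t)idx << Q_TAIL_IDX_OFFSET);
}

static void decode_tail(uint32_t tail, int *cpu, int *idx)
{
    *cpu = (int)((tail & Q_TAIL_CPU_MASK) >> Q_TAIL_CPU_OFFSET) - 1;
    *idx = (int)((tail & Q_TAIL_IDX_MASK) >> Q_TAIL_IDX_OFFSET);
}

int main(void)
{
    int cpu, idx;
    uint32_t t = encode_tail(5, 0);

    decode_tail(t, &cpu, &idx);
    printf("tail=0x%08x -> cpu=%d idx=%d\n", t, cpu, idx);  /* cpu=5 idx=0 */
    return 0;
}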

Phase 1

__komb_spin_lock_longjmp(struct qspinlock *lock, int tail,
                         register struct komb_node *curr_node)
{
    register struct komb_node *prev_node = NULL, *next_node = NULL;
    struct qspinlock *parent_lock;
    int old_tail, val, j;
    uint32_t prev_cs_cpu;
    bool prev_locked_val;
    uint64_t prev_rsp;
    uint64_t prev_counter_val;
    struct komb_node *prev_next_node_ptr = NULL;

    /**
     * 36 qprev = SWAP(&lock.tail, &qnode)
     **/
    // Publish our node as the new tail
    old_tail = xchg_tail(lock, tail);

    // If the queue was not empty
    if (old_tail & _Q_TAIL_MASK) {
        // Link ourselves behind the previous tail
        prev_node = decode_tail(old_tail);
        prev_node->next = curr_node;

        /**
         * 39 while qnode.wait is True:
         **/
        // Wait on qnode.wait (curr_node->locked)
        smp_cond_load_relaxed_sched(&curr_node->locked, !(VAL));

        struct shadow_stack *ptr = this_cpu_ptr(&local_shadow_stack);

        /**
         * 41 if qnode.request == PRCSD:
         **/
        // Check whether the request has already been processed
        // (critical section executed by the combiner)
        if (curr_node->completed) {
            return 0;
        }
    }

    // Phase 2
    ...

    // Phase 3
    ...
}
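Phase 1 is the classic MCS enqueue: swap yourself into the tail, link behind the previous tail, and spin on your own flag. The sketch below isolates that pattern as a plain MCS lock in C11 atomics, with no TAS top-level lock and no combining; mcs_lock, mcs_node and the function names are illustrative.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct mcs_node {
    _Atomic(struct mcs_node *) next;
    atomic_bool locked;              /* true while this waiter must spin */
};

struct mcs_lock {
    _Atomic(struct mcs_node *) tail;
};

void mcs_lock_acquire(struct mcs_lock *lock, struct mcs_node *me)
{
    struct mcs_node *prev;

    atomic_store_explicit(&me->next, NULL, memory_order_relaxed);
    atomic_store_explicit(&me->locked, true, memory_order_relaxed);

    /* 36 qprev = SWAP(&lock.tail, &qnode) */
    prev = atomic_exchange_explicit(&lock->tail, me, memory_order_acq_rel);
    if (prev) {
        /* queue was not empty: link behind the previous tail */
        atomic_store_explicit(&prev->next, me, memory_order_release);
        /* 39 while qnode.wait is True: spin until the predecessor hands over */
        while (atomic_load_explicit(&me->locked, memory_order_acquire))
            ;
    }
}

void mcs_lock_release(struct mcs_lock *lock, struct mcs_node *me)
{
    struct mcs_node *next = atomic_load_explicit(&me->next, memory_order_acquire);

    if (!next) {
        /* no known successor: try to reset the tail to empty */
        struct mcs_node *expected = me;
        if (atomic_compare_exchange_strong_explicit(&lock->tail, &expected, NULL,
                                                    memory_order_acq_rel,
                                                    memory_order_acquire))
            return;
        /* a successor is enqueueing; wait for the link to appear */
        while (!(next = atomic_load_explicit(&me->next, memory_order_acquire)))
            ;
    }
    /* hand the lock to the successor */
    atomic_store_explicit(&next->locked, false, memory_order_release);
}

In komb, the per-CPU komb_nodes[0] entry plays the role of me, and xchg_tail() stores an encoded (cpu, idx) tail in the lock word instead of a node pointer.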

Phase 2

// Now waiter is at the head of the queue

/*
* we're at the head of the waitqueue, wait for the owner & pending to
* go away.
*
* *,x,y -> *,0,0
*
* this wait loop must use a load-acquire such that we match the
* store-release that clears the locked bit and create lock
* sequentiality; this is because the set_locked() function below
* does not imply a full barrier.
*
*/
// Read lock->val until the condition !(VAL & _Q_LOCKED_PENDING_MASK)
// holds (owner and pending gone); the value that satisfied the
// condition is returned into val
val = atomic_cond_read_acquire(&lock->val,
                               !(VAL & _Q_LOCKED_PENDING_MASK));

atomic_cond_read_acquire() spins on lock->val until the condition holds, and the load that observes the final value has ACQUIRE semantics, pairing with the store-release in the unlock path (details in the Global lock section below).

Phase 3

/*
 * claim the lock:
 *
 * n,0,0 -> 0,0,1 : lock, uncontended
 * *,*,0 -> *,*,1 : lock, contended
 *
 * If the queue head is the only one in the queue (lock value == tail)
 * and nobody is pending, clear the tail code and grab the lock.
 * Otherwise, we only need to grab the lock.
 */

/**
 * 52 if CAS(&lock.tail, qnode, None) == qnode:
 * 53     return # If we are the only one in the queue, return
 **/
if (((val & _Q_TAIL_MASK) == tail) &&
    atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
    goto release; /* No contention */

/* Either somebody is queued behind us or _Q_PENDING_VAL is set */
// Declare the combining phase (line 63 in the pseudocode)
check_and_set_combiner(lock);

/*
 * contended path; wait for next if not observed yet, release.
 */

/**
 * 54 qnext = qnode.next
 * 55 while qnext is None:
 * 56     qnext = qnode.next
 **/
// Wait until the next node is visible
smp_cond_load_relaxed_sched(&curr_node->next, (VAL));
next_node = curr_node->next;

run_combiner(lock, next_node);
run_combiner(struct qspinlock *lock, struct komb_node *curr_node)
{
    struct komb_node *next_node = curr_node->next;
    int counter = 0;

    /**
     * 58 if qnext.next == None:
     * 59     notify_next_queue_head(qnext) # next waiter is combiner
     * 60     return
     **/
    // Fewer than 2 nodes in the queue: hand over and return
    if (next_node == NULL) {
        set_locked(lock);
        /*
         * Make this node spin on the locked variable and then it will
         * become the combiner.
         */
        curr_node->locked = false;
        return;
    }

    // Phase 4
    ...
}

Phase 4

struct shadow_stack *ptr = this_cpu_ptr(&local_shadow_stack);

// Declaring the combining phase was already done in Phase 3

/**
 * 65 while True: # Combiner loop
 **/
while (curr_node) {
    counter++;
    next_node = get_next_node(curr_node);
    execute_cs(lock, curr_node);
    clear_locked_set_completed(curr_node);
    if (next_node == NULL || next_node->next == NULL ||
        counter >= komb_batch_size)
        break;
    curr_node = next_node;
}

// Combining phase over: set the lock, then run our own critical section
set_locked(lock);

/*
 * Make this node spin on the locked variable and then it will become
 * the combiner.
 */
next_node->locked = false;
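For comparison, the sketch below shows the shape of this combiner loop in classic delegation / flat-combining style, where every waiter publishes its critical section as a function plus argument. TCLocks does not need such closures (it ships the waiter's stack instead), so req_node, cs_fn, BATCH_SIZE and run_combiner_loop are illustrative only, and the waiting for not-yet-linked successors is elided.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct req_node {
    _Atomic(struct req_node *) next;
    atomic_bool completed;
    void (*cs_fn)(void *);           /* the waiter's critical section */
    void *cs_arg;
};

#define BATCH_SIZE 64                /* stand-in for komb_batch_size */

/* Runs requests starting at 'node'; returns the node that should become
 * the next combiner (left uncompleted), or NULL if the queue drained. */
static struct req_node *run_combiner_loop(struct req_node *node)
{
    int counter = 0;

    while (node) {
        struct req_node *next;

        counter++;
        next = atomic_load_explicit(&node->next, memory_order_acquire);

        node->cs_fn(node->cs_arg);                    /* execute_cs()                 */
        atomic_store_explicit(&node->completed, true, /* clear_locked_set_completed() */
                              memory_order_release);

        /* Stop at the batch limit or when fewer than two waiters remain,
         * so the last one can take over as the next combiner. */
        if (next == NULL ||
            atomic_load_explicit(&next->next, memory_order_acquire) == NULL ||
            counter >= BATCH_SIZE)
            return next;

        node = next;
    }
    return NULL;
}

static void add_to_counter(void *arg) { *(long *)arg += 1; }

int main(void)
{
    long counter_val = 0;
    struct req_node n[3];

    for (int i = 0; i < 3; i++) {
        atomic_init(&n[i].next, i + 1 < 3 ? &n[i + 1] : NULL);
        atomic_init(&n[i].completed, false);
        n[i].cs_fn = add_to_counter;
        n[i].cs_arg = &counter_val;
    }
    struct req_node *handoff = run_combiner_loop(&n[0]);
    printf("combined=%ld handoff=%p\n", counter_val, (void *)handoff);
    return 0;
}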

spin_unlock

komb_spin_unlock(struct qspinlock *lock)
{
    // Not in a combining phase: unlock and return
    if (lock->locked == _Q_LOCKED_VAL ||
        lock->locked == _Q_LOCKED_IRQ_VAL) {
        lock->locked = false;
        return;
    }

    struct shadow_stack *ptr = this_cpu_ptr(&local_shadow_stack);
    struct komb_node *curr_node = per_cpu_ptr(&komb_nodes[0], from_cpuid);
    incoming_rsp_ptr = &(ptr->local_shadow_stack_ptr);
    outgoing_rsp_ptr = &(curr_node->rsp);

    // Jump back to the combiner
    komb_context_switch(incoming_rsp_ptr, outgoing_rsp_ptr);

    return;
}

Global lock

Lock struct

Top level lock, MCS queue

struct orig_qspinlock {
    union {
        atomic_t val;
        struct {
            u8 locked;
            u8 pending;
        };
        struct {
            u16 locked_pending;
            u16 tail;
        };
    };
};
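Because val, (locked, pending) and (locked_pending, tail) are views of the same 32-bit word, byte and half-word accesses can manipulate individual fields. A userspace mock of the aliasing (little-endian assumed, as on x86; union and field names here are made up):

#include <stdint.h>
#include <stdio.h>

union qspinlock_word {
    uint32_t val;
    struct {
        uint8_t locked;              /* bits  0-7  */
        uint8_t pending;             /* bits  8-15 */
    };
    struct {
        uint16_t locked_pending;     /* bits  0-15 */
        uint16_t tail;               /* bits 16-31 */
    };
};

int main(void)
{
    union qspinlock_word l = { .val = 0 };

    l.locked = 1;                    /* what set_locked() does          */
    l.tail = (7 + 1) << 2 | 0;       /* cpu 7, idx 0 in the 16-bit view */

    /* all three views alias the same word: prints val=0x00200001 */
    printf("val=0x%08x locked=%u pending=%u tail=0x%04x\n",
           l.val, (unsigned)l.locked, (unsigned)l.pending, (unsigned)l.tail);
    return 0;
}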

Acquire

In spin_lock()

val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);
if (val == 0)
    return;

Fastpath: if val == 0, the lock word was free and we acquired it.

In Phase 2

Now the current node has been notified by the previous node

Its CS has not been executed by a combiner

It wants to acquire the global lock

/*
* we're at the head of the waitqueue, wait for the owner & pending to
* go away.
*
* *,x,y -> *,0,0
*
* this wait loop must use a load-acquire such that we match the
* store-release that clears the locked bit and create lock
* sequentiality; this is because the set_locked() function below
* does not imply a full barrier.
*
*/

val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));

Since

#define atomic_cond_read_acquire(v, c) smp_cond_load_acquire(&(v)->counter, (c))

This actually calls smp_cond_load_acquire()

-> refer to: https://www.kernel.org/doc/html/latest/core-api/wrappers/memory-barriers.html

When the lock is neither held nor pending, the loop stops and val gets the lock word. The generic implementation spins with relaxed reads and then issues an acquire barrier, so the returned value has ACQUIRE ordering and pairs with the store-release that clears the locked byte on unlock.
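A userspace sketch of that pairing in C11 atomics: the unlocker publishes its critical-section writes with a store-release, the waiter spins with relaxed loads and then issues one acquire fence, which is roughly what smp_cond_load_acquire() does. lock_word, shared_data and the thread functions are illustrative names.

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_uint lock_word = 1;   /* starts "locked" by the unlocker */
static int shared_data;

static void *unlocker(void *arg)
{
    (void)arg;
    shared_data = 42;                                            /* write inside the CS   */
    atomic_store_explicit(&lock_word, 0, memory_order_release);  /* store-release: unlock */
    return NULL;
}

static void *locker(void *arg)
{
    (void)arg;
    /* spin with relaxed loads until the lock word clears ... */
    while (atomic_load_explicit(&lock_word, memory_order_relaxed) != 0)
        ;
    /* ... then one acquire barrier */
    atomic_thread_fence(memory_order_acquire);
    printf("shared_data = %d\n", shared_data);   /* guaranteed to print 42 */
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, locker, NULL);
    pthread_create(&b, NULL, unlocker, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}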

/**
* set_locked - Set the lock bit and own the lock
* @lock: Pointer to queued spinlock structure
*
* *,*,0 -> *,0,1
*/
static __always_inline void set_locked(struct qspinlock *lock)
{
    WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}
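set_locked() writes only the low (locked) byte, so it never disturbs the tail half-word that waiters update concurrently with xchg_tail(). A userspace mock of the idea, with simplified stand-ins for the kernel's ONCE accessors (the real macros add more checking) and a hypothetical mock_qspinlock type:

#include <stdint.h>

/* force a single, untorn access the compiler cannot elide or fuse */
#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)       (*(volatile __typeof__(x) *)&(x))

struct mock_qspinlock {
    union {
        uint32_t val;
        struct {
            uint8_t locked;
            uint8_t pending;
            uint16_t tail;
        };
    };
};

/* touches only the locked byte; the tail half-word is left alone */
static inline void mock_set_locked(struct mock_qspinlock *lock)
{
    WRITE_ONCE(lock->locked, 1u);   /* _Q_LOCKED_VAL */
}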

Further reading: https://deepdives.medium.com/kernel-locking-deep-dive-into-spinlocks-part-1-bcdc46ee8df6