TCLocks Note
coconutnut

paper repo

Repo structure

.
├── LICENSE
├── README.md
├── scripts
│   ├── run-*-benchmark.sh   // run benchmarks (host)
│   └── run-vm.sh            // run VM (host)
└── src
    ├── defaults.sh          // configure VM cores
    ├── benchmarks
    ├── kernel
    │   ├── linux-5.14.16
    │   └── rcuht
    │       ├── rcuht.c          // lock interface
    │       ├── run-rcuht-*.sh   // run rcuhashbash test (guest, called by benchmarks)
    │       ├── include
    │       │   ├── lib
    │       │   ├── mutex
    │       │   ├── rwlock
    │       │   ├── rwsem
    │       │   └── spinlock
    │       ├── lib
    │       ├── locks            // lock implementation
    │       └── script
    └── userspace

TCLock implementation

Listing 1 in paper

code: komb.h, komb.c

spin_lock

komb_spin_lock(struct qspinlock *lock)
{
    u32 val, cnt;
    struct komb_node *curr_node = NULL;

    // Try to acquire the TAS lock
    val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);
    // expected value: 0, new value: _Q_LOCKED_VAL
    // if lock->val was 0: lock->val becomes _Q_LOCKED_VAL and val = 0
    // if lock->val was not 0: val = the current lock->val, lock unchanged
    if (val == 0)
        // Got the lock: return and execute the critical section
        return;

    curr_node = this_cpu_ptr(&komb_nodes[0]);
    // Begin the slow path
    komb_spin_lock_slowpath(lock); // the stack switch happens in this function
}
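For intuition, here is a minimal userspace sketch of the same TAS fastpath pattern using C11 atomics. The names tas_lock, tas_try_fastpath and Q_LOCKED_VAL are made up for illustration; in the kernel code the compare-exchange role is played by atomic_cmpxchg_acquire().

#include <stdatomic.h>
#include <stdbool.h>

#define Q_LOCKED_VAL 1u                /* stand-in for _Q_LOCKED_VAL */

struct tas_lock {
    atomic_uint val;                   /* 0 = unlocked, Q_LOCKED_VAL = locked */
};

/* Returns true if the fast path took the lock, false if the caller
 * must fall into a slow path (queueing, combining, ...). */
bool tas_try_fastpath(struct tas_lock *lock)
{
    unsigned int expected = 0;

    /* compare-and-swap with acquire ordering on success:
     * only succeeds when the whole lock word is 0 */
    return atomic_compare_exchange_strong_explicit(
        &lock->val, &expected, Q_LOCKED_VAL,
        memory_order_acquire,          /* success: we own the lock */
        memory_order_relaxed);         /* failure: just report the miss */
}

komb_spin_lock does the equivalent check: val == 0 means the CAS succeeded; anything else falls through to komb_spin_lock_slowpath().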

switch_stack

komb_spin_lock_slowpath(struct qspinlock *lock)
{
    register int ret_val;

    /**
     * 26 switch_to_ephemeral_stack(cst.node) # Main → ephemeral stack
     **/

    // Inline assembly:
    // push the callee-saved registers onto the current (main) stack
    asm volatile("pushq %%rbp\n"
                 "pushq %%rbx\n"
                 "pushq %%r12\n"
                 "pushq %%r13\n"
                 "pushq %%r14\n"
                 "pushq %%r15\n"
                 :
                 :
                 // this inline assembly may affect memory
                 : "memory");
    asm volatile(
        // call get_komb_node
        // %P0 is a placeholder for the first input operand and is
        // replaced by the symbol of get_komb_node
        "callq %P0\n"
        // save the stack pointer rsp into komb_node->rsp
        // (%c1 is the constant offsetof(struct komb_node, rsp),
        //  rax holds the pointer returned by get_komb_node)
        "movq %%rsp, %c1(%%rax)\n"
        :
        // input operands bound to %P0 and %c1
        : "i"(get_komb_node), "i"(offsetof(struct komb_node, rsp))
        : "memory");
    asm volatile(
        // call get_shadow_stack_ptr
        "callq %P0\n"
        // dereference the returned address and load it into rsp:
        // from here on we run on the ephemeral (shadow) stack
        "movq (%%rax), %%rsp\n"
        :
        : "i"(get_shadow_stack_ptr)
        : "memory");

    /**
     * 27 lock_slowpath(lock, cst)
     **/
    ret_val = __komb_spin_lock_slowpath(lock);

    /**
     * 28 switch_from_ephemeral_stack(cst.node) # Ephemeral → main stack
     **/
    // ret_val is the return value of __komb_spin_lock_slowpath;
    // Phase 1 returns 0 when the combiner has already executed this
    // waiter's critical section (curr_node->completed)
    if (ret_val) {
        asm volatile("callq %P0\n"
                     "movq (%%rax), %%rsp\n"
                     "popq %%r15\n"
                     "popq %%r14\n"
                     "popq %%r13\n"
                     "popq %%r12\n"
                     "popq %%rbx\n"
                     "popq %%rbp\n"
                     "retq\n"
                     :
                     : "i"(get_shadow_stack_ptr)
                     : "memory");
    } else {
        // store the current rsp into *get_shadow_stack_ptr()
        asm volatile("callq %P0\n"
                     "movq %%rsp, (%%rax)\n"
                     :
                     : "i"(get_shadow_stack_ptr)
                     : "memory");
        // load rsp from komb_node->rsp (back to the main stack)
        asm volatile("callq %P0\n"
                     "movq %c1(%%rax), %%rsp\n"
                     :
                     : "i"(get_komb_node),
                       "i"(offsetof(struct komb_node, rsp))
                     : "memory");
        // pop the saved callee-saved registers and return
        asm volatile("popq %%r15\n"
                     "popq %%r14\n"
                     "popq %%r13\n"
                     "popq %%r12\n"
                     "popq %%rbx\n"
                     "popq %%rbp\n"
                     "retq\n"
                     :
                     :
                     : "memory");
    }
}
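The push / save-rsp / load-rsp / pop / ret pattern above is essentially a coroutine-style stack switch. Below is a self-contained userspace sketch of the same idea for x86-64 SysV (Linux, GCC or Clang). stack_switch, ephemeral_entry and the stack setup are hypothetical illustration code, not the kernel's komb_context_switch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* stack_switch(save_sp, load_sp):
 *   push the callee-saved registers, save rsp to *save_sp,
 *   load rsp from *load_sp, pop the callee-saved registers and
 *   return on the other stack. */
__asm__(
    ".text\n"
    ".globl stack_switch\n"
    "stack_switch:\n"
    "    pushq %rbp\n"
    "    pushq %rbx\n"
    "    pushq %r12\n"
    "    pushq %r13\n"
    "    pushq %r14\n"
    "    pushq %r15\n"
    "    movq %rsp, (%rdi)\n"   /* *save_sp = current rsp */
    "    movq (%rsi), %rsp\n"   /* rsp = *load_sp          */
    "    popq %r15\n"
    "    popq %r14\n"
    "    popq %r13\n"
    "    popq %r12\n"
    "    popq %rbx\n"
    "    popq %rbp\n"
    "    retq\n"
);

void stack_switch(void **save_sp, void **load_sp);

static void *main_sp;
static void *ephemeral_sp;

static void ephemeral_entry(void)
{
    printf("running on the ephemeral stack\n");
    /* jump back to the main stack (like switch_from_ephemeral_stack) */
    stack_switch(&ephemeral_sp, &main_sp);
    /* never reached in this demo */
}

int main(void)
{
    enum { STACK_BYTES = 64 * 1024 };
    uint64_t *mem = malloc(STACK_BYTES);
    uint64_t *sp = (uint64_t *)(((uintptr_t)mem + STACK_BYTES) & ~(uintptr_t)0xf);

    /* Build an initial frame so the first switch "returns" into
     * ephemeral_entry with a SysV-aligned stack: one padding slot,
     * the entry address (consumed by retq), then six zeroed
     * callee-saved slots (rbp, rbx, r12-r15). */
    *--sp = 0;                         /* padding / fake return address */
    *--sp = (uint64_t)ephemeral_entry; /* retq target */
    for (int i = 0; i < 6; i++)
        *--sp = 0;
    ephemeral_sp = sp;

    stack_switch(&main_sp, &ephemeral_sp);   /* main -> ephemeral */
    printf("back on the main stack\n");
    free(mem);
    return 0;
}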

lock_slowpath

__komb_spin_lock_slowpath(struct qspinlock *lock)
{
    struct komb_node *curr_node;
    int tail, idx;

    // Get the per-CPU qnode
    curr_node = this_cpu_ptr(&komb_nodes[0]);
    idx = curr_node->count++;
    tail = encode_tail(smp_processor_id(), idx);

    // Initialize the waiter's qnode
    curr_node->locked = true;
    curr_node->completed = false;
    curr_node->next = NULL;
    curr_node->tail = tail;
    curr_node->socket_id = numa_node_id();
    curr_node->cpuid = smp_processor_id();
    curr_node->irqs_disabled = false;
    curr_node->lock = lock;
    curr_node->task_struct_ptr = current;

    return __komb_spin_lock_longjmp(lock, tail, curr_node);
}
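encode_tail() packs (cpu, idx) into the upper bits of the 32-bit lock word, so a single xchg on the tail identifies the last waiter without storing a pointer. The standalone sketch below mirrors the upstream qspinlock layout (locked bits 0-7, pending bits 8-15, tail idx bits 16-17, tail cpu+1 bits 18-31); komb's exact masks may differ, and the real decode_tail returns the per-CPU node rather than a (cpu, idx) pair.

#include <stdint.h>
#include <stdio.h>

#define Q_TAIL_IDX_OFFSET 16
#define Q_TAIL_CPU_OFFSET 18
#define Q_TAIL_IDX_MASK   (0x3u << Q_TAIL_IDX_OFFSET)
#define Q_TAIL_CPU_MASK   (0x3fffu << Q_TAIL_CPU_OFFSET)
#define Q_TAIL_MASK       (Q_TAIL_IDX_MASK | Q_TAIL_CPU_MASK)

static uint32_t encode_tail(int cpu, int idx)
{
    /* cpu is stored as cpu + 1 so that tail == 0 means "no waiter" */
    return ((uint32_t)(cpu + 1) << Q_TAIL_CPU_OFFSET) |
           ((uint32_t)idx << Q_TAIL_IDX_OFFSET);
}

static void decode_tail(uint32_t tail, int *cpu, int *idx)
{
    *cpu = (int)((tail & Q_TAIL_CPU_MASK) >> Q_TAIL_CPU_OFFSET) - 1;
    *idx = (int)((tail & Q_TAIL_IDX_MASK) >> Q_TAIL_IDX_OFFSET);
}

int main(void)
{
    int cpu, idx;
    uint32_t t = encode_tail(5, 0);

    decode_tail(t, &cpu, &idx);
    printf("tail=0x%08x -> cpu=%d idx=%d\n", t, cpu, idx);  /* cpu=5 idx=0 */
    return 0;
}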

Phase 1

__komb_spin_lock_longjmp(struct qspinlock *lock, int tail,
                         register struct komb_node *curr_node)
{
    register struct komb_node *prev_node = NULL, *next_node = NULL;
    struct qspinlock *parent_lock;
    int old_tail, val, j;
    uint32_t prev_cs_cpu;
    bool prev_locked_val;
    uint64_t prev_rsp;
    uint64_t prev_counter_val;
    struct komb_node *prev_next_node_ptr = NULL;

    /**
     * 36 qprev = SWAP(&lock.tail, &qnode)
     **/
    // Publish our node as the new tail
    old_tail = xchg_tail(lock, tail);

    // If the queue was not empty
    if (old_tail & _Q_TAIL_MASK) {
        // Link ourselves behind the previous tail
        prev_node = decode_tail(old_tail);
        prev_node->next = curr_node;

        /**
         * 39 while qnode.wait is True:
         **/
        // Wait on qnode.wait (curr_node->locked)
        smp_cond_load_relaxed_sched(&curr_node->locked, !(VAL));

        struct shadow_stack *ptr = this_cpu_ptr(&local_shadow_stack);

        /**
         * 41 if qnode.request == PRCSD:
         **/
        // Check whether the request has already been processed
        // (critical section executed by the combiner)
        if (curr_node->completed) {
            return 0;
        }
    }

    // Phase 2
    ...

    // Phase 3
    ...
}
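Phase 1 is the classic MCS enqueue: swap yourself into the tail, link behind the previous tail, and spin on your own flag. The sketch below isolates that pattern as a plain MCS lock in C11 atomics, with no TAS top-level lock and no combining; mcs_lock, mcs_node and the function names are illustrative.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct mcs_node {
    _Atomic(struct mcs_node *) next;
    atomic_bool locked;              /* true while this waiter must spin */
};

struct mcs_lock {
    _Atomic(struct mcs_node *) tail;
};

void mcs_lock_acquire(struct mcs_lock *lock, struct mcs_node *me)
{
    struct mcs_node *prev;

    atomic_store_explicit(&me->next, NULL, memory_order_relaxed);
    atomic_store_explicit(&me->locked, true, memory_order_relaxed);

    /* 36 qprev = SWAP(&lock.tail, &qnode) */
    prev = atomic_exchange_explicit(&lock->tail, me, memory_order_acq_rel);
    if (prev) {
        /* queue was not empty: link behind the previous tail */
        atomic_store_explicit(&prev->next, me, memory_order_release);
        /* 39 while qnode.wait is True: spin until the predecessor hands over */
        while (atomic_load_explicit(&me->locked, memory_order_acquire))
            ;
    }
}

void mcs_lock_release(struct mcs_lock *lock, struct mcs_node *me)
{
    struct mcs_node *next = atomic_load_explicit(&me->next, memory_order_acquire);

    if (!next) {
        /* no known successor: try to reset the tail to empty */
        struct mcs_node *expected = me;
        if (atomic_compare_exchange_strong_explicit(&lock->tail, &expected, NULL,
                                                    memory_order_acq_rel,
                                                    memory_order_acquire))
            return;
        /* a successor is enqueueing; wait for the link to appear */
        while (!(next = atomic_load_explicit(&me->next, memory_order_acquire)))
            ;
    }
    /* hand the lock to the successor */
    atomic_store_explicit(&next->locked, false, memory_order_release);
}

In komb, the per-CPU komb_nodes[0] entry plays the role of me, and xchg_tail() stores an encoded (cpu, idx) tail in the lock word instead of a node pointer.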

Phase 2

// Now waiter is at the head of the queue

/*
* we're at the head of the waitqueue, wait for the owner & pending to
* go away.
*
* *,x,y -> *,0,0
*
* this wait loop must use a load-acquire such that we match the
* store-release that clears the locked bit and create lock
* sequentiality; this is because the set_locked() function below
* does not imply a full barrier.
*
*/
// Read lock->val until the condition !(VAL & _Q_LOCKED_PENDING_MASK)
// holds (owner and pending gone); the value that satisfied the
// condition is returned into val
val = atomic_cond_read_acquire(&lock->val,
                               !(VAL & _Q_LOCKED_PENDING_MASK));

atomic_cond_read_acquire() spins on lock->val until the condition holds, and the load that observes the final value has ACQUIRE semantics, pairing with the store-release in the unlock path (details in the Global lock section below).

Phase 3

/*
 * claim the lock:
 *
 * n,0,0 -> 0,0,1 : lock, uncontended
 * *,*,0 -> *,*,1 : lock, contended
 *
 * If the queue head is the only one in the queue (lock value == tail)
 * and nobody is pending, clear the tail code and grab the lock.
 * Otherwise, we only need to grab the lock.
 */

/**
 * 52 if CAS(&lock.tail, qnode, None) == qnode:
 * 53     return # If we are the only one in the queue, return
 **/
if (((val & _Q_TAIL_MASK) == tail) &&
    atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
    goto release; /* No contention */

/* Either somebody is queued behind us or _Q_PENDING_VAL is set */
// Declare the combining phase (line 63 in the pseudocode)
check_and_set_combiner(lock);

/*
 * contended path; wait for next if not observed yet, release.
 */

/**
 * 54 qnext = qnode.next
 * 55 while qnext is None:
 * 56     qnext = qnode.next
 **/
// Wait until the next node is visible
smp_cond_load_relaxed_sched(&curr_node->next, (VAL));
next_node = curr_node->next;

run_combiner(lock, next_node);
run_combiner(struct qspinlock *lock, struct komb_node *curr_node)
{
    struct komb_node *next_node = curr_node->next;
    int counter = 0;

    /**
     * 58 if qnext.next == None:
     * 59     notify_next_queue_head(qnext) # next waiter is combiner
     * 60     return
     **/
    // Fewer than 2 nodes in the queue: hand over and return
    if (next_node == NULL) {
        set_locked(lock);
        /*
         * Make this node spin on the locked variable and then it will
         * become the combiner.
         */
        curr_node->locked = false;
        return;
    }

    // Phase 4
    ...
}

Phase 4

struct shadow_stack *ptr = this_cpu_ptr(&local_shadow_stack);

// Declaring the combining phase was already done in Phase 3

/**
 * 65 while True: # Combiner loop
 **/
while (curr_node) {
    counter++;
    next_node = get_next_node(curr_node);
    execute_cs(lock, curr_node);
    clear_locked_set_completed(curr_node);
    if (next_node == NULL || next_node->next == NULL ||
        counter >= komb_batch_size)
        break;
    curr_node = next_node;
}

// Combining phase over: set the lock, then run our own critical section
set_locked(lock);

/*
 * Make this node spin on the locked variable and then it will become
 * the combiner.
 */
next_node->locked = false;
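For comparison, the sketch below shows the shape of this combiner loop in classic delegation / flat-combining style, where every waiter publishes its critical section as a function plus argument. TCLocks does not need such closures (it ships the waiter's stack instead), so req_node, cs_fn, BATCH_SIZE and run_combiner_loop are illustrative only, and the waiting for not-yet-linked successors is elided.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct req_node {
    _Atomic(struct req_node *) next;
    atomic_bool completed;
    void (*cs_fn)(void *);           /* the waiter's critical section */
    void *cs_arg;
};

#define BATCH_SIZE 64                /* stand-in for komb_batch_size */

/* Runs requests starting at 'node'; returns the node that should become
 * the next combiner (left uncompleted), or NULL if the queue drained. */
static struct req_node *run_combiner_loop(struct req_node *node)
{
    int counter = 0;

    while (node) {
        struct req_node *next;

        counter++;
        next = atomic_load_explicit(&node->next, memory_order_acquire);

        node->cs_fn(node->cs_arg);                    /* execute_cs()                 */
        atomic_store_explicit(&node->completed, true, /* clear_locked_set_completed() */
                              memory_order_release);

        /* Stop at the batch limit or when fewer than two waiters remain,
         * so the last one can take over as the next combiner. */
        if (next == NULL ||
            atomic_load_explicit(&next->next, memory_order_acquire) == NULL ||
            counter >= BATCH_SIZE)
            return next;

        node = next;
    }
    return NULL;
}

static void add_to_counter(void *arg) { *(long *)arg += 1; }

int main(void)
{
    long counter_val = 0;
    struct req_node n[3];

    for (int i = 0; i < 3; i++) {
        atomic_init(&n[i].next, i + 1 < 3 ? &n[i + 1] : NULL);
        atomic_init(&n[i].completed, false);
        n[i].cs_fn = add_to_counter;
        n[i].cs_arg = &counter_val;
    }
    struct req_node *handoff = run_combiner_loop(&n[0]);
    printf("combined=%ld handoff=%p\n", counter_val, (void *)handoff);
    return 0;
}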

spin_unlock

komb_spin_unlock(struct qspinlock *lock)
{
    // Not in a combining phase: unlock and return
    if (lock->locked == _Q_LOCKED_VAL ||
        lock->locked == _Q_LOCKED_IRQ_VAL) {
        lock->locked = false;
        return;
    }

    struct shadow_stack *ptr = this_cpu_ptr(&local_shadow_stack);
    struct komb_node *curr_node = per_cpu_ptr(&komb_nodes[0], from_cpuid);
    incoming_rsp_ptr = &(ptr->local_shadow_stack_ptr);
    outgoing_rsp_ptr = &(curr_node->rsp);

    // Jump back to the combiner
    komb_context_switch(incoming_rsp_ptr, outgoing_rsp_ptr);

    return;
}

Global lock

Lock struct

Top level lock, MCS queue

struct orig_qspinlock {
    union {
        atomic_t val;
        struct {
            u8 locked;
            u8 pending;
        };
        struct {
            u16 locked_pending;
            u16 tail;
        };
    };
};
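Because val, (locked, pending) and (locked_pending, tail) are views of the same 32-bit word, byte and half-word accesses can manipulate individual fields. A userspace mock of the aliasing (little-endian assumed, as on x86; union and field names here are made up):

#include <stdint.h>
#include <stdio.h>

union qspinlock_word {
    uint32_t val;
    struct {
        uint8_t locked;              /* bits  0-7  */
        uint8_t pending;             /* bits  8-15 */
    };
    struct {
        uint16_t locked_pending;     /* bits  0-15 */
        uint16_t tail;               /* bits 16-31 */
    };
};

int main(void)
{
    union qspinlock_word l = { .val = 0 };

    l.locked = 1;                    /* what set_locked() does          */
    l.tail = (7 + 1) << 2 | 0;       /* cpu 7, idx 0 in the 16-bit view */

    /* all three views alias the same word: prints val=0x00200001 */
    printf("val=0x%08x locked=%u pending=%u tail=0x%04x\n",
           l.val, (unsigned)l.locked, (unsigned)l.pending, (unsigned)l.tail);
    return 0;
}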

Acquire

In spin_lock()

val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);
if (val == 0)
    return;

Fastpath: if val == 0, the lock word was free and we acquired it.

In Phase 2

Now the current node has been notified by the previous node

Its CS has not been executed by a combiner

It wants to acquire the global lock

/*
* we're at the head of the waitqueue, wait for the owner & pending to
* go away.
*
* *,x,y -> *,0,0
*
* this wait loop must use a load-acquire such that we match the
* store-release that clears the locked bit and create lock
* sequentiality; this is because the set_locked() function below
* does not imply a full barrier.
*
*/

val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));

Since

#define atomic_cond_read_acquire(v, c) smp_cond_load_acquire(&(v)->counter, (c))

This actually calls smp_cond_load_acquire()

-> refer to: https://www.kernel.org/doc/html/latest/core-api/wrappers/memory-barriers.html

When the lock is neither held nor pending, the loop stops and val gets the lock word. The generic implementation spins with relaxed reads and then issues an acquire barrier, so the returned value has ACQUIRE ordering and pairs with the store-release that clears the locked byte on unlock.
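A userspace sketch of that pairing in C11 atomics: the unlocker publishes its critical-section writes with a store-release, the waiter spins with relaxed loads and then issues one acquire fence, which is roughly what smp_cond_load_acquire() does. lock_word, shared_data and the thread functions are illustrative names.

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_uint lock_word = 1;   /* starts "locked" by the unlocker */
static int shared_data;

static void *unlocker(void *arg)
{
    (void)arg;
    shared_data = 42;                                            /* write inside the CS   */
    atomic_store_explicit(&lock_word, 0, memory_order_release);  /* store-release: unlock */
    return NULL;
}

static void *locker(void *arg)
{
    (void)arg;
    /* spin with relaxed loads until the lock word clears ... */
    while (atomic_load_explicit(&lock_word, memory_order_relaxed) != 0)
        ;
    /* ... then one acquire barrier */
    atomic_thread_fence(memory_order_acquire);
    printf("shared_data = %d\n", shared_data);   /* guaranteed to print 42 */
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, locker, NULL);
    pthread_create(&b, NULL, unlocker, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}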

/**
* set_locked - Set the lock bit and own the lock
* @lock: Pointer to queued spinlock structure
*
* *,*,0 -> *,0,1
*/
static __always_inline void set_locked(struct qspinlock *lock)
{
    WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}
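set_locked() writes only the low (locked) byte, so it never disturbs the tail half-word that waiters update concurrently with xchg_tail(). A userspace mock of the idea, with simplified stand-ins for the kernel's ONCE accessors (the real macros add more checking) and a hypothetical mock_qspinlock type:

#include <stdint.h>

/* force a single, untorn access the compiler cannot elide or fuse */
#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)       (*(volatile __typeof__(x) *)&(x))

struct mock_qspinlock {
    union {
        uint32_t val;
        struct {
            uint8_t locked;
            uint8_t pending;
            uint16_t tail;
        };
    };
};

/* touches only the locked byte; the tail half-word is left alone */
static inline void mock_set_locked(struct mock_qspinlock *lock)
{
    WRITE_ONCE(lock->locked, 1u);   /* _Q_LOCKED_VAL */
}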

Further reading: https://deepdives.medium.com/kernel-locking-deep-dive-into-spinlocks-part-1-bcdc46ee8df6