In this document I will explain what is going on in the machine code generated by the Fil-C compiler for the insert_sorted
function in this simple program:
#include <stdlib.h>
#include <stdio.h>
/* Singly-linked list node: 'next' points at the following node (NULL terminates
 * the list) and 'value' is the payload kept in ascending order by insert_sorted. */
struct list {
struct list* next;
int value;
};
/*
 * Insert a freshly allocated node carrying 'value' into the list rooted at
 * *list_head, keeping the list sorted in ascending order, and return the new node.
 * 'noinline' keeps the function as a distinct symbol so its generated machine
 * code can be examined in isolation (the point of this document).
 *
 * NOTE(review): the malloc result is used unchecked; on allocation failure this
 * dereferences NULL. Deliberate for a small demo — kept as-is so the source
 * matches the disassembly discussed below.
 */
__attribute__((noinline)) struct list* insert_sorted(struct list** list_head, int value)
{
struct list* node = (struct list*)malloc(sizeof(struct list));
node->value = value;
struct list** next_ptr;
/* Walk the chain of 'next' slots; next_ptr always addresses the pointer we may
 * overwrite, so insertion works uniformly at the head, middle, or tail. */
for (next_ptr = list_head; ; next_ptr = &(*next_ptr)->next) {
/* Insert before the first node with a larger value, or at the end. */
if (!*next_ptr || (*next_ptr)->value > value) {
node->next = *next_ptr;
*next_ptr = node;
return node;
}
}
}
/* Driver: insert 100 pseudo-random values into an initially empty list, then
 * print them in sorted order. Returns 0 on success. */
int main()
{
struct list* list = NULL;
unsigned i;
unsigned value = 42;
for (i = 100; i--;) {
/* xorshift32 PRNG step (Marsaglia) — cheap deterministic "random" input. */
value ^= value << 13;
value ^= value >> 17;
value ^= value << 5;
insert_sorted(&list, value);
}
/* The nodes are never freed; fine for a demo process that exits immediately. */
for (; list; list = list->next)
printf("%d\n", list->value);
return 0;
}
The goal of this document is to explain to you how Fil-C works today. Since it's such a young implementation, Fil-C has lots of optimization opportunities. This document links to GH issues for all of the optimization opportunities it identifies.
I compiled this program with my build of the Fil-C compiler (revision 61e304b42d01e3fc132dac802fe3f4b2c299653d, not far off from Fil-C release 0.668.3):
build/bin/clang -o test43 test43.c -O2 -g
And then dumped the disassembly with:
objdump -d test43
Here's the full disassembly of insert_sorted
. Don't worry if you don't get it immediately; I'll explain it instruction by instruction.
0000000000002250 <pizlonated_insert_sorted>:
2250: 48 8d 05 09 00 00 00 lea 0x9(%rip),%rax # 2260 <Jf_insert_sorted>
2257: 48 8d 15 ca 29 00 00 lea 0x29ca(%rip),%rdx # 4c28 <Jfo_insert_sorted+0x10>
225e: c3 ret
0000000000002260 <Jf_insert_sorted>:
2260: 55 push %rbp
2261: 41 57 push %r15
2263: 41 56 push %r14
2265: 41 55 push %r13
2267: 41 54 push %r12
2269: 53 push %rbx
226a: 48 83 ec 58 sub $0x58,%rsp
226e: 48 39 27 cmp %rsp,(%rdi)
2271: 0f 83 e9 fd ff ff jae 2060 <filc_stack_overflow_failure@plt>
2277: 48 8b 4f 10 mov 0x10(%rdi),%rcx
227b: 48 89 4c 24 28 mov %rcx,0x28(%rsp)
2280: 48 8d 4c 24 28 lea 0x28(%rsp),%rcx
2285: 48 89 4f 10 mov %rcx,0x10(%rdi)
2289: 48 8d 0d d8 27 00 00 lea 0x27d8(%rip),%rcx # 4a68 <Jgo_.str+0x58>
2290: 48 89 4c 24 30 mov %rcx,0x30(%rsp)
2295: 0f 57 c0 xorps %xmm0,%xmm0
2298: 0f 11 44 24 38 movups %xmm0,0x38(%rsp)
229d: 0f 11 44 24 48 movups %xmm0,0x48(%rsp)
22a2: 48 83 fe 0f cmp $0xf,%rsi
22a6: 0f 86 7e 04 00 00 jbe 272a <Jf_insert_sorted+0x4ca>
22ac: 48 89 fb mov %rdi,%rbx
22af: 4c 8b af 80 00 00 00 mov 0x80(%rdi),%r13
22b6: 8b 87 88 00 00 00 mov 0x88(%rdi),%eax
22bc: 89 44 24 08 mov %eax,0x8(%rsp)
22c0: 4c 8b a7 80 01 00 00 mov 0x180(%rdi),%r12
22c7: 48 8d 35 02 29 00 00 lea 0x2902(%rip),%rsi # 4bd0 <Jgo_.str+0x1c0>
22ce: e8 ed fd ff ff call 20c0 <pizlonated_malloc@plt>
22d3: 48 c7 83 80 00 00 00 movq $0x10,0x80(%rbx)
22da: 10 00 00 00
22de: 48 c7 83 80 01 00 00 movq $0x0,0x180(%rbx)
22e5: 00 00 00 00
22e9: 48 8d 0d 18 29 00 00 lea 0x2918(%rip),%rcx # 4c08 <Jgo_.str+0x1f8>
22f0: 48 89 4c 24 30 mov %rcx,0x30(%rsp)
22f5: 48 85 d2 test %rdx,%rdx
22f8: 0f 84 21 04 00 00 je 271f <Jf_insert_sorted+0x4bf>
22fe: 48 8b 4a f8 mov -0x8(%rdx),%rcx
2302: 48 be 00 00 00 00 00 movabs $0x3c0000000000000,%rsi
2309: 00 c0 03
230c: 48 21 ce and %rcx,%rsi
230f: 48 bf 00 00 00 00 00 movabs $0x40000000000000,%rdi
2316: 00 40 00
2319: 48 39 fe cmp %rdi,%rsi
231c: 0f 85 fd 03 00 00 jne 271f <Jf_insert_sorted+0x4bf>
2322: 48 be ff ff ff ff ff movabs $0xffffffffffff,%rsi
2329: ff 00 00
232c: 48 21 f1 and %rsi,%rcx
232f: 48 39 c8 cmp %rcx,%rax
2332: 0f 85 e7 03 00 00 jne 271f <Jf_insert_sorted+0x4bf>
2338: be 08 00 00 00 mov $0x8,%esi
233d: 48 89 df mov %rbx,%rdi
2340: ff d0 call *%rax
2342: a8 01 test $0x1,%al
2344: 0f 85 00 03 00 00 jne 264a <Jf_insert_sorted+0x3ea>
234a: 48 83 fa 07 cmp $0x7,%rdx
234e: 0f 86 ed 03 00 00 jbe 2741 <Jf_insert_sorted+0x4e1>
2354: 48 8b bb 80 00 00 00 mov 0x80(%rbx),%rdi
235b: 48 8b b3 80 01 00 00 mov 0x180(%rbx),%rsi
2362: 48 89 74 24 38 mov %rsi,0x38(%rsp)
2367: 48 85 f6 test %rsi,%rsi
236a: 44 8b 54 24 08 mov 0x8(%rsp),%r10d
236f: 0f 84 b9 02 00 00 je 262e <Jf_insert_sorted+0x3ce>
2375: f6 46 fe 06 testb $0x6,-0x2(%rsi)
2379: 0f 85 af 02 00 00 jne 262e <Jf_insert_sorted+0x3ce>
237f: 48 8d 47 08 lea 0x8(%rdi),%rax
2383: 48 39 f0 cmp %rsi,%rax
2386: 0f 82 a2 02 00 00 jb 262e <Jf_insert_sorted+0x3ce>
238c: 48 8b 4e f0 mov -0x10(%rsi),%rcx
2390: 48 83 c1 fc add $0xfffffffffffffffc,%rcx
2394: 48 39 c8 cmp %rcx,%rax
2397: 0f 87 91 02 00 00 ja 262e <Jf_insert_sorted+0x3ce>
239d: 44 89 10 mov %r10d,(%rax)
23a0: 4c 89 64 24 40 mov %r12,0x40(%rsp)
23a5: 4d 85 e4 test %r12,%r12
23a8: 0f 84 55 02 00 00 je 2603 <Jf_insert_sorted+0x3a3>
23ae: 49 89 f8 mov %rdi,%r8
23b1: 4c 8d 4e f0 lea -0x10(%rsi),%r9
23b5: 4c 8d 1d 9c 26 00 00 lea 0x269c(%rip),%r11 # 4a58 <Jgo_.str+0x48>
23bc: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
23c3: ff 00 00
23c6: 41 f6 c5 07 test $0x7,%r13b
23ca: 0f 85 33 02 00 00 jne 2603 <Jf_insert_sorted+0x3a3>
23d0: 4c 89 ed mov %r13,%rbp
23d3: 4c 29 e5 sub %r12,%rbp
23d6: 0f 82 27 02 00 00 jb 2603 <Jf_insert_sorted+0x3a3>
23dc: 4d 3b 6c 24 f0 cmp -0x10(%r12),%r13
23e1: 0f 83 1c 02 00 00 jae 2603 <Jf_insert_sorted+0x3a3>
23e7: 49 8b 44 24 f8 mov -0x8(%r12),%rax
23ec: 48 21 d0 and %rdx,%rax
23ef: 0f 84 ba 00 00 00 je 24af <Jf_insert_sorted+0x24f>
23f5: 4c 8b 3c 28 mov (%rax,%rbp,1),%r15
23f9: 41 f6 c7 01 test $0x1,%r15b
23fd: 75 67 jne 2466 <Jf_insert_sorted+0x206>
23ff: 4d 8b 75 00 mov 0x0(%r13),%r14
2403: 4c 89 7c 24 48 mov %r15,0x48(%rsp)
2408: 4d 85 f6 test %r14,%r14
240b: 0f 84 bd 00 00 00 je 24ce <Jf_insert_sorted+0x26e>
2411: 4d 85 ff test %r15,%r15
2414: 0f 84 fb 01 00 00 je 2615 <Jf_insert_sorted+0x3b5>
241a: 49 8d 46 08 lea 0x8(%r14),%rax
241e: 4c 39 f8 cmp %r15,%rax
2421: 0f 82 ee 01 00 00 jb 2615 <Jf_insert_sorted+0x3b5>
2427: 49 8b 4f f0 mov -0x10(%r15),%rcx
242b: 48 83 c1 fc add $0xfffffffffffffffc,%rcx
242f: 48 39 c8 cmp %rcx,%rax
2432: 0f 87 dd 01 00 00 ja 2615 <Jf_insert_sorted+0x3b5>
2438: 4c 89 7c 24 50 mov %r15,0x50(%rsp)
243d: 44 39 10 cmp %r10d,(%rax)
2440: 0f 8f 2f 01 00 00 jg 2575 <Jf_insert_sorted+0x315>
2446: f6 43 08 0e testb $0xe,0x8(%rbx)
244a: 75 24 jne 2470 <Jf_insert_sorted+0x210>
244c: 4c 89 7c 24 40 mov %r15,0x40(%rsp)
2451: 4d 89 f5 mov %r14,%r13
2454: 4d 89 fc mov %r15,%r12
2457: 41 f6 c5 07 test $0x7,%r13b
245b: 0f 84 6f ff ff ff je 23d0 <Jf_insert_sorted+0x170>
2461: e9 9d 01 00 00 jmp 2603 <Jf_insert_sorted+0x3a3>
2466: 49 83 e7 fe and $0xfffffffffffffffe,%r15
246a: 4d 8b 7f 08 mov 0x8(%r15),%r15
246e: eb 8f jmp 23ff <Jf_insert_sorted+0x19f>
2470: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
2475: 48 89 df mov %rbx,%rdi
2478: 49 89 f4 mov %rsi,%r12
247b: 4c 89 de mov %r11,%rsi
247e: 4c 89 c5 mov %r8,%rbp
2481: 4d 89 cd mov %r9,%r13
2484: e8 97 fb ff ff call 2020 <filc_pollcheck_slow@plt>
2489: 4c 8d 1d c8 25 00 00 lea 0x25c8(%rip),%r11 # 4a58 <Jgo_.str+0x48>
2490: 44 8b 54 24 08 mov 0x8(%rsp),%r10d
2495: 4d 89 e9 mov %r13,%r9
2498: 49 89 e8 mov %rbp,%r8
249b: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
24a0: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
24a7: ff 00 00
24aa: 4c 89 e6 mov %r12,%rsi
24ad: eb 9d jmp 244c <Jf_insert_sorted+0x1ec>
24af: 4d 8b 75 00 mov 0x0(%r13),%r14
24b3: 48 c7 44 24 48 00 00 movq $0x0,0x48(%rsp)
24ba: 00 00
24bc: 31 c9 xor %ecx,%ecx
24be: 41 bf 00 00 00 00 mov $0x0,%r15d
24c4: 4d 85 f6 test %r14,%r14
24c7: 74 08 je 24d1 <Jf_insert_sorted+0x271>
24c9: e9 4a 01 00 00 jmp 2618 <Jf_insert_sorted+0x3b8>
24ce: 4c 89 f9 mov %r15,%rcx
24d1: 41 f6 c0 07 test $0x7,%r8b
24d5: 0f 85 63 01 00 00 jne 263e <Jf_insert_sorted+0x3de>
24db: 49 39 f0 cmp %rsi,%r8
24de: 0f 82 5a 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
24e4: 48 8b 46 f8 mov -0x8(%rsi),%rax
24e8: 48 0f ba e0 32 bt $0x32,%rax
24ed: 0f 82 4b 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
24f3: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
24f9: 0f 85 b9 01 00 00 jne 26b8 <Jf_insert_sorted+0x458>
24ff: 45 31 f6 xor %r14d,%r14d
2502: 48 21 d0 and %rdx,%rax
2505: 0f 84 aa 00 00 00 je 25b5 <Jf_insert_sorted+0x355>
250b: 48 29 f7 sub %rsi,%rdi
250e: 4c 8b 0d cb 2a 00 00 mov 0x2acb(%rip),%r9 # 4fe0 <filc_is_marking>
2515: 4d 85 ff test %r15,%r15
2518: 74 0a je 2524 <Jf_insert_sorted+0x2c4>
251a: 41 80 39 00 cmpb $0x0,(%r9)
251e: 0f 85 a0 01 00 00 jne 26c4 <Jf_insert_sorted+0x464>
2524: 48 89 0c 38 mov %rcx,(%rax,%rdi,1)
2528: 4d 89 30 mov %r14,(%r8)
252b: 49 23 54 24 f8 and -0x8(%r12),%rdx
2530: 0f 84 1d 01 00 00 je 2653 <Jf_insert_sorted+0x3f3>
2536: 41 80 39 00 cmpb $0x0,(%r9)
253a: 0f 85 4d 01 00 00 jne 268d <Jf_insert_sorted+0x42d>
2540: 48 89 34 2a mov %rsi,(%rdx,%rbp,1)
2544: 4d 89 45 00 mov %r8,0x0(%r13)
2548: 4c 89 83 80 00 00 00 mov %r8,0x80(%rbx)
254f: 48 89 b3 80 01 00 00 mov %rsi,0x180(%rbx)
2556: ba 08 00 00 00 mov $0x8,%edx
255b: 31 c0 xor %eax,%eax
255d: 48 8b 4c 24 28 mov 0x28(%rsp),%rcx
2562: 48 89 4b 10 mov %rcx,0x10(%rbx)
2566: 48 83 c4 58 add $0x58,%rsp
256a: 5b pop %rbx
256b: 41 5c pop %r12
256d: 41 5d pop %r13
256f: 41 5e pop %r14
2571: 41 5f pop %r15
2573: 5d pop %rbp
2574: c3 ret
2575: 41 f6 c0 07 test $0x7,%r8b
2579: 0f 85 2d 01 00 00 jne 26ac <Jf_insert_sorted+0x44c>
257f: 49 39 f0 cmp %rsi,%r8
2582: 0f 82 24 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2588: 48 8b 46 f8 mov -0x8(%rsi),%rax
258c: 48 0f ba e0 32 bt $0x32,%rax
2591: 0f 82 15 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2597: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
259d: 0f 85 70 01 00 00 jne 2713 <Jf_insert_sorted+0x4b3>
25a3: 4c 89 f9 mov %r15,%rcx
25a6: 41 bf 01 00 00 00 mov $0x1,%r15d
25ac: 48 21 d0 and %rdx,%rax
25af: 0f 85 56 ff ff ff jne 250b <Jf_insert_sorted+0x2ab>
25b5: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
25ba: 48 8d 05 2f 25 00 00 lea 0x252f(%rip),%rax # 4af0 <Jgo_.str+0xe0>
25c1: 48 89 44 24 30 mov %rax,0x30(%rsp)
25c6: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
25cb: 48 89 df mov %rbx,%rdi
25ce: 48 89 74 24 08 mov %rsi,0x8(%rsp)
25d3: 4c 89 ce mov %r9,%rsi
25d6: 4c 89 44 24 20 mov %r8,0x20(%rsp)
25db: e8 20 fb ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
25e0: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
25e5: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
25ea: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
25f1: ff 00 00
25f4: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
25f9: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
25fe: e9 08 ff ff ff jmp 250b <Jf_insert_sorted+0x2ab>
2603: 48 8d 15 c6 24 00 00 lea 0x24c6(%rip),%rdx # 4ad0 <Jgo_.str+0xc0>
260a: 4c 89 ef mov %r13,%rdi
260d: 4c 89 e6 mov %r12,%rsi
2610: e8 db fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
2615: 4c 89 f9 mov %r15,%rcx
2618: 49 83 c6 08 add $0x8,%r14
261c: 48 8d 15 4d 25 00 00 lea 0x254d(%rip),%rdx # 4b70 <Jgo_.str+0x160>
2623: 4c 89 f7 mov %r14,%rdi
2626: 48 89 ce mov %rcx,%rsi
2629: e8 c2 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
262e: 48 83 c7 08 add $0x8,%rdi
2632: 48 8d 15 67 24 00 00 lea 0x2467(%rip),%rdx # 4aa0 <Jgo_.str+0x90>
2639: e8 b2 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
263e: 48 8d 15 cb 24 00 00 lea 0x24cb(%rip),%rdx # 4b10 <Jgo_.str+0x100>
2645: e8 a6 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
264a: b0 01 mov $0x1,%al
264c: 31 d2 xor %edx,%edx
264e: e9 0a ff ff ff jmp 255d <Jf_insert_sorted+0x2fd>
2653: 49 83 c4 f0 add $0xfffffffffffffff0,%r12
2657: 48 8d 05 d2 24 00 00 lea 0x24d2(%rip),%rax # 4b30 <Jgo_.str+0x120>
265e: 48 89 44 24 30 mov %rax,0x30(%rsp)
2663: 48 89 df mov %rbx,%rdi
2666: 49 89 f6 mov %rsi,%r14
2669: 4c 89 e6 mov %r12,%rsi
266c: 4d 89 c7 mov %r8,%r15
266f: 4d 89 cc mov %r9,%r12
2672: e8 89 fa ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
2677: 4d 89 e1 mov %r12,%r9
267a: 4d 89 f8 mov %r15,%r8
267d: 4c 89 f6 mov %r14,%rsi
2680: 48 89 c2 mov %rax,%rdx
2683: 41 80 39 00 cmpb $0x0,(%r9)
2687: 0f 84 b3 fe ff ff je 2540 <Jf_insert_sorted+0x2e0>
268d: 48 89 df mov %rbx,%rdi
2690: 49 89 f6 mov %rsi,%r14
2693: 49 89 d7 mov %rdx,%r15
2696: 4d 89 c4 mov %r8,%r12
2699: e8 02 fa ff ff call 20a0 <filc_store_barrier_for_lower_slow@plt>
269e: 4d 89 e0 mov %r12,%r8
26a1: 4c 89 fa mov %r15,%rdx
26a4: 4c 89 f6 mov %r14,%rsi
26a7: e9 94 fe ff ff jmp 2540 <Jf_insert_sorted+0x2e0>
26ac: 48 8d 15 dd 24 00 00 lea 0x24dd(%rip),%rdx # 4b90 <Jgo_.str+0x180>
26b3: e8 38 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
26b8: 48 8d 15 81 24 00 00 lea 0x2481(%rip),%rdx # 4b40 <Jgo_.str+0x130>
26bf: e9 46 ff ff ff jmp 260a <Jf_insert_sorted+0x3aa>
26c4: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
26c9: 48 89 df mov %rbx,%rdi
26cc: 48 89 74 24 08 mov %rsi,0x8(%rsp)
26d1: 48 89 ce mov %rcx,%rsi
26d4: 4c 89 44 24 20 mov %r8,0x20(%rsp)
26d9: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
26de: 49 89 c7 mov %rax,%r15
26e1: e8 ba f9 ff ff call 20a0 <filc_store_barrier_for_lower_slow@plt>
26e6: 4c 8b 0d f3 28 00 00 mov 0x28f3(%rip),%r9 # 4fe0 <filc_is_marking>
26ed: 4c 89 f8 mov %r15,%rax
26f0: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
26f5: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
26fa: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
26ff: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
2706: ff 00 00
2709: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
270e: e9 11 fe ff ff jmp 2524 <Jf_insert_sorted+0x2c4>
2713: 48 8d 15 96 24 00 00 lea 0x2496(%rip),%rdx # 4bb0 <Jgo_.str+0x1a0>
271a: e9 eb fe ff ff jmp 260a <Jf_insert_sorted+0x3aa>
271f: 48 89 c7 mov %rax,%rdi
2722: 48 89 d6 mov %rdx,%rsi
2725: e8 b6 f9 ff ff call 20e0 <filc_check_function_call_fail@plt>
272a: 48 89 f0 mov %rsi,%rax
272d: 48 8d 15 34 23 00 00 lea 0x2334(%rip),%rdx # 4a68 <Jgo_.str+0x58>
2734: be 10 00 00 00 mov $0x10,%esi
2739: 48 89 c7 mov %rax,%rdi
273c: e8 ff f8 ff ff call 2040 <filc_cc_args_check_failure@plt>
2741: 48 8d 05 88 24 00 00 lea 0x2488(%rip),%rax # 4bd0 <Jgo_.str+0x1c0>
2748: be 08 00 00 00 mov $0x8,%esi
274d: 48 89 d7 mov %rdx,%rdi
2750: 48 89 c2 mov %rax,%rdx
2753: e8 78 f9 ff ff call 20d0 <filc_cc_rets_check_failure@plt>
Let's begin! First, the linker thunk:
0000000000002250 <pizlonated_insert_sorted>:
2250: 48 8d 05 09 00 00 00 lea 0x9(%rip),%rax # 2260 <Jf_insert_sorted>
2257: 48 8d 15 ca 29 00 00 lea 0x29ca(%rip),%rdx # 4c28 <Jfo_insert_sorted+0x10>
225e: c3 ret
Fil-C has memory-safe linking and puts all external symbols in the pizlonated_
namespace. Each linker symbol (whether code or data) is a function that returns a flight pointer (a tuple of the pointer's capability pointer and the pointer's value) to the global. Here, we're returning Jf_insert_sorted
- the actual implementation - as the pointer's value, and a pointer to a readonly internal global that represents the capability for that function. Jf_insert_sorted
is an internal function (the linker doesn't link calls to it).
Next, the prologue of Jf_insert_sorted
, which is the implementation of insert_sorted
:
2260: 55 push %rbp
2261: 41 57 push %r15
2263: 41 56 push %r14
2265: 41 55 push %r13
2267: 41 54 push %r12
2269: 53 push %rbx
226a: 48 83 ec 58 sub $0x58,%rsp
226e: 48 39 27 cmp %rsp,(%rdi)
2271: 0f 83 e9 fd ff ff jae 2060 <filc_stack_overflow_failure@plt>
We're saving a bunch of callee save registers using push
, then setting up some stack space by sub
ing from %rsp
(the stack pointer).
The last two instructions - the cmp
and jae
- are checking if we've overflowed the stack. Here, %rdi
is the first argument in the normal X86_64
calling convention, which Fil-C uses to pass around the thread pointer. (Issue 16: Turn the thread pointer into a proper pinned register). The full signature of a pizlonated function is:
struct pizlonated_return_value {
bool has_exception;
size_t return_size;
};
typedef pizlonated_return_value (*pizlonated_function)(
filc_thread* my_thread, size_t argument_size);
The first field in the Fil-C thread object is the stack limit. If we breach that, we jump to the special filc_stack_overflow_failure
thunk, which is engineered to work correctly even if we had blown past the end of the stack so thoroughly that %rsp
was no longer pointing at a valid page.
Next let's look at the Pizderson frame setup:
2277: 48 8b 4f 10 mov 0x10(%rdi),%rcx
227b: 48 89 4c 24 28 mov %rcx,0x28(%rsp)
2280: 48 8d 4c 24 28 lea 0x28(%rsp),%rcx
2285: 48 89 4f 10 mov %rcx,0x10(%rdi)
2289: 48 8d 0d d8 27 00 00 lea 0x27d8(%rip),%rcx # 4a68 <Jgo_.str+0x58>
2290: 48 89 4c 24 30 mov %rcx,0x30(%rsp)
2295: 0f 57 c0 xorps %xmm0,%xmm0
2298: 0f 11 44 24 38 movups %xmm0,0x38(%rsp)
229d: 0f 11 44 24 48 movups %xmm0,0x48(%rsp)
Pizderson frames allow for not-too-hilariously-expensive accurate GC with minimal compiler support and a dead-simple runtime interface (i.e. how the runtime scans the stack). Pizderson frames are like Henderson frames except they only support non-moving GC, which means that pointers in locals can still be register-allocated. (Issue 17: Implement real accurate stack scanning)
Here's the struct layout of a Fil-C Pizderson frame:
struct filc_frame {
filc_frame* parent;
const filc_origin* origin;
void* lowers[];
};
In this assembly snippet, we're first setting up parent
:
2277: 48 8b 4f 10 mov 0x10(%rdi),%rcx
227b: 48 89 4c 24 28 mov %rcx,0x28(%rsp)
The first argument register (%rdi
) holds the thread pointer, and offset 0x10
happens to be where the top frame pointer field is. We load that and store it to the parent
field of the stack-allocated filc_frame
.
Then we store the address of the Pizderson frame to the top frame pointer in the thread:
2280: 48 8d 4c 24 28 lea 0x28(%rsp),%rcx
2285: 48 89 4f 10 mov %rcx,0x10(%rdi)
And then we store the origin (compiler-generated metadata about a specific point in the code).
2289: 48 8d 0d d8 27 00 00 lea 0x27d8(%rip),%rcx # 4a68 <Jgo_.str+0x58>
2290: 48 89 4c 24 30 mov %rcx,0x30(%rsp)
Note that we'll store other origins into this field. We'll do something like this store at every point in the code where we perform a call (either to the Fil-C runtime or to another pizlonated function). The origin also contains information about this function, such as the length of the lowers
array.
Finally, we initialize the lowers
.
2295: 0f 57 c0 xorps %xmm0,%xmm0
2298: 0f 11 44 24 38 movups %xmm0,0x38(%rsp)
229d: 0f 11 44 24 48 movups %xmm0,0x48(%rsp)
In Fil-C runtime and compiler jargon, the lower
is a particular representation of the capability, where we point just above the filc_object
(i.e. the capability object), which also happens to be the lower bounds. If you see accesses at negative constant offsets as big as -16, then those are likely to be filc_object
accesses. Since this function doesn't yet have any pointers that need to be tracked by GC, this is just initialized to zero. Whenever a pointer value is produced, it'll be stored into somewhere in this lowers
array. (Issue 18: Optimize how the compiler emits stores to the filc_frame lowers array)
Next, we process insert_sorted
's arguments, i.e. struct list** list_head, int value
:
22a2: 48 83 fe 0f cmp $0xf,%rsi
22a6: 0f 86 7e 04 00 00 jbe 272a <Jf_insert_sorted+0x4ca>
22ac: 48 89 fb mov %rdi,%rbx
22af: 4c 8b af 80 00 00 00 mov 0x80(%rdi),%r13
22b6: 8b 87 88 00 00 00 mov 0x88(%rdi),%eax
22bc: 89 44 24 08 mov %eax,0x8(%rsp)
22c0: 4c 8b a7 80 01 00 00 mov 0x180(%rdi),%r12
Recall that in the Fil-C calling convention, the second argument (%rsi
) is the "argument size":
typedef pizlonated_return_value (*pizlonated_function)(
filc_thread* my_thread, size_t argument_size);
This is the number of bytes worth of arguments being passed. Arguments are passed in a thread-local buffer that starts at offset 0x80
from the thread pointer. Each primitive argument is passed using its corresponding 64-bit "canonical" representation (so an int16_t
would be passed as a int64_t
and a float
would be passed as a double
). Hence, insert_sorted
takes 16 bytes worth of arguments, and this code is branching away with a jbe
if the argument size is <= 0xf
.
Next it looks like we're copying the thread pointer into a callee-save register:
22ac: 48 89 fb mov %rdi,%rbx
And finally we're loading the arguments:
22af: 4c 8b af 80 00 00 00 mov 0x80(%rdi),%r13
22b6: 8b 87 88 00 00 00 mov 0x88(%rdi),%eax
22bc: 89 44 24 08 mov %eax,0x8(%rsp)
22c0: 4c 8b a7 80 01 00 00 mov 0x180(%rdi),%r12
Seems like one of the arguments (int value
) is immediately spilled to 0x8(%rsp)
. Notice that there are three loads total. This is because there is a second buffer for the arguments' capabilities. Since struct list** list_head
is a pointer, we load its lower
from that buffer (at offset 0x180
).
Fil-C's calling convention allows for maximally sloppy function pointer casts. If the callsite's signature disagrees with the callee's signature, then stuff just tends to work. It's not clear that this is a requirement of Fil-C, and even if it was, this thread-local buffer for arguments (and return values, as we'll see later) is hilariously inefficient. (Issue 22: The Fil-C calling convention should be optimized)
malloc
Next we get the pointer to malloc
:
22c7: 48 8d 35 02 29 00 00 lea 0x2902(%rip),%rsi # 4bd0 <Jgo_.str+0x1c0>
22ce: e8 ed fd ff ff call 20c0 <pizlonated_malloc@plt>
The pizlonated_malloc
function just returns the pointer to malloc
along with its capability. These are now in %rax
(the pointer) and %rdx
(its capability; specifically, its lower).
The signature of a pizlonated getter is:
typedef filc_ptr (*pizlonated_getter)(
filc_thread* my_thread, const filc_origin* passed_origin);
In this call to pizlonated_malloc
, we still had the thread pointer in %rdi
, so that got passed as the first argument. And we materialized an origin and passed it as the passed_origin
argument via %rsi
. This is in case the getter has to be able to print a stack trace, which could happen if we're calling an ifunc (yes, Fil-C supports ifuncs and they are memory-safe).
Now we set up the arguments to the call to malloc
. Remember, we're doing:
struct list* node = (struct list*)malloc(sizeof(struct list));
sizeof(struct list)
is 16 bytes. This is the code that passes the argument:
22d3: 48 c7 83 80 00 00 00 movq $0x10,0x80(%rbx)
22da: 10 00 00 00
22de: 48 c7 83 80 01 00 00 movq $0x0,0x180(%rbx)
22e5: 00 00 00 00
We're storing 16 as the argument's value, and zero as the argument's lower (since it's not a pointer). This ensures that if malloc
was really a void* (*)(void* whatever)
, then any accesses on whatever
would fail due to the null capability check.
Next we store the malloc
callsite's origin to our Pizderson frame:
22e9: 48 8d 0d 18 29 00 00 lea 0x2918(%rip),%rcx # 4c08 <Jgo_.str+0x1f8>
22f0: 48 89 4c 24 30 mov %rcx,0x30(%rsp)
Then we do the function call check. This checks that the function pointer we're about to call is really pointing at a function:
22f5: 48 85 d2 test %rdx,%rdx
22f8: 0f 84 21 04 00 00 je 271f <Jf_insert_sorted+0x4bf>
22fe: 48 8b 4a f8 mov -0x8(%rdx),%rcx
2302: 48 be 00 00 00 00 00 movabs $0x3c0000000000000,%rsi
2309: 00 c0 03
230c: 48 21 ce and %rcx,%rsi
230f: 48 bf 00 00 00 00 00 movabs $0x40000000000000,%rdi
2316: 00 40 00
2319: 48 39 fe cmp %rdi,%rsi
231c: 0f 85 fd 03 00 00 jne 271f <Jf_insert_sorted+0x4bf>
2322: 48 be ff ff ff ff ff movabs $0xffffffffffff,%rsi
2329: ff 00 00
232c: 48 21 f1 and %rsi,%rcx
232f: 48 39 c8 cmp %rcx,%rax
2332: 0f 85 e7 03 00 00 jne 271f <Jf_insert_sorted+0x4bf>
This is a super gross check! (Issue 19: InvisiCaps 2.0: Remove flag bits from the aux ptr, never have null aux pointers, and stop using the low bits of lowers for atomicity tricks).
Let's break it down. First we have to check that we don't have the null capability:
22f5: 48 85 d2 test %rdx,%rdx
22f8: 0f 84 21 04 00 00 je 271f <Jf_insert_sorted+0x4bf>
Then we have to load the aux pointer from the capability:
22fe: 48 8b 4a f8 mov -0x8(%rdx),%rcx
This pointer has flags in the high 16 bits. First we're extracting the flag bits of interest to us into %rsi
:
2302: 48 be 00 00 00 00 00 movabs $0x3c0000000000000,%rsi
2309: 00 c0 03
230c: 48 21 ce and %rcx,%rsi
And then we're comparing the flags to the value they would have for a function capability:
230f: 48 bf 00 00 00 00 00 movabs $0x40000000000000,%rdi
2316: 00 40 00
2319: 48 39 fe cmp %rdi,%rsi
Finally, we're masking out the actual aux pointer:
2322: 48 be ff ff ff ff ff movabs $0xffffffffffff,%rsi
2329: ff 00 00
232c: 48 21 f1 and %rsi,%rcx
And comparing it to the function pointer:
232f: 48 39 c8 cmp %rcx,%rax
2332: 0f 85 e7 03 00 00 jne 271f <Jf_insert_sorted+0x4bf>
This ensures that the function pointer really points at the function. (Issue 20: Function capabilities returned from pizlonated getters are never offset)
Finally we make the call and extract the return value:
2338: be 08 00 00 00 mov $0x8,%esi
233d: 48 89 df mov %rbx,%rdi
2340: ff d0 call *%rax
2342: a8 01 test $0x1,%al
2344: 0f 85 00 03 00 00 jne 264a <Jf_insert_sorted+0x3ea>
234a: 48 83 fa 07 cmp $0x7,%rdx
234e: 0f 86 ed 03 00 00 jbe 2741 <Jf_insert_sorted+0x4e1>
2354: 48 8b bb 80 00 00 00 mov 0x80(%rbx),%rdi
235b: 48 8b b3 80 01 00 00 mov 0x180(%rbx),%rsi
The two arguments to any pizlonated function are filc_thread* my_thread, size_t argument_size
. Here we pass 8 as the second argument (the argument size, in %esi
), since we're only passing one primitive argument. And we pass our thread pointer as the first argument (%rdi
). Then we actually call malloc
:
2340: ff d0 call *%rax
And check if it "threw an exception":
2342: a8 01 test $0x1,%al
2344: 0f 85 00 03 00 00 jne 264a <Jf_insert_sorted+0x3ea>
Pizlonated functions return the following:
struct pizlonated_return_value {
bool has_exception; /* returned in %rax */
size_t return_size; /* returned in %rdx */
};
So, our first check is that malloc
isn't "throwing an exception", which really means either:
_Unwind_RaiseException
, or_Unwind_ForcedUnwind
.All C functions support the latter, so even C code has to defend against this. (Issue 21: Implement Fil-C's unwinding in terms of native unwinding)
After that, we check that it indeed returned a value:
234a: 48 83 fa 07 cmp $0x7,%rdx
234e: 0f 86 ed 03 00 00 jbe 2741 <Jf_insert_sorted+0x4e1>
Specifically, we're branching away if the return size is <= 0x7
. And then we finish making the call by loading the returned pointer and its capability from the thread's buffer:
2354: 48 8b bb 80 00 00 00 mov 0x80(%rbx),%rdi
235b: 48 8b b3 80 01 00 00 mov 0x180(%rbx),%rsi
The last instruction stores the node
into the Pizderson frame:
2362: 48 89 74 24 38 mov %rsi,0x38(%rsp)
It's pretty incredible that it takes so much work to call malloc
. So much of this could be optimized away. But even separately from general function call optimizations, the compiler could special-case malloc
. (Issue 23: Turn malloc calls into direct calls into the GC allocator)
value
Now we're at:
node->value = value;
This is what it takes to do that store:
2367: 48 85 f6 test %rsi,%rsi
236a: 44 8b 54 24 08 mov 0x8(%rsp),%r10d
236f: 0f 84 b9 02 00 00 je 262e <Jf_insert_sorted+0x3ce>
2375: f6 46 fe 06 testb $0x6,-0x2(%rsi)
2379: 0f 85 af 02 00 00 jne 262e <Jf_insert_sorted+0x3ce>
237f: 48 8d 47 08 lea 0x8(%rdi),%rax
2383: 48 39 f0 cmp %rsi,%rax
2386: 0f 82 a2 02 00 00 jb 262e <Jf_insert_sorted+0x3ce>
238c: 48 8b 4e f0 mov -0x10(%rsi),%rcx
2390: 48 83 c1 fc add $0xfffffffffffffffc,%rcx
2394: 48 39 c8 cmp %rcx,%rax
2397: 0f 87 91 02 00 00 ja 262e <Jf_insert_sorted+0x3ce>
239d: 44 89 10 mov %r10d,(%rax)
First we are checking that node
's capability isn't null:
2367: 48 85 f6 test %rsi,%rsi
236a: 44 8b 54 24 08 mov 0x8(%rsp),%r10d
236f: 0f 84 b9 02 00 00 je 262e <Jf_insert_sorted+0x3ce>
The compiler is being clever here, and it's interleaving the check with a fill of int value
(it had previously been spilled at 0x8(%rsp)
).
Then we check that node
's capability is not readonly:
2375: f6 46 fe 06 testb $0x6,-0x2(%rsi)
2379: 0f 85 af 02 00 00 jne 262e <Jf_insert_sorted+0x3ce>
Note that if we fix Issue 23 then we won't have to do most of these checks, since we'll know that the allocator returned a valid object that isn't readonly. But anyways, next we materialize &node->value
and check that it's above the lower bound:
237f: 48 8d 47 08 lea 0x8(%rdi),%rax
2383: 48 39 f0 cmp %rsi,%rax
2386: 0f 82 a2 02 00 00 jb 262e <Jf_insert_sorted+0x3ce>
Recall that capabilities are represented by the runtime and compiler using a lower
, i.e. the lower bound, which also happens to be a pointer to the top of filc_object
. Hence why the flag check was at -0x2(%rsi)
and the lower bound is just %rsi
.
Then we check that the upper bound minus 4 (sizeof(node->value)
) is greater than or equal to &node->value
(and if it was below, then we branch away to a check fail).
Finally we store value
into node->value
:
239d: 44 89 10 mov %r10d,(%rax)
Now we're at the loop:
for (next_ptr = list_head; ; next_ptr = &(*next_ptr)->next) {
if (!*next_ptr || (*next_ptr)->value > value) {
node->next = *next_ptr;
*next_ptr = node;
return node;
}
}
But before we get into the loop body, we have the loop's pre-header (the code that executes before we start looping):
23a0: 4c 89 64 24 40 mov %r12,0x40(%rsp)
23a5: 4d 85 e4 test %r12,%r12
23a8: 0f 84 55 02 00 00 je 2603 <Jf_insert_sorted+0x3a3>
23ae: 49 89 f8 mov %rdi,%r8
23b1: 4c 8d 4e f0 lea -0x10(%rsi),%r9
23b5: 4c 8d 1d 9c 26 00 00 lea 0x269c(%rip),%r11 # 4a58 <Jgo_.str+0x48>
23bc: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
23c3: ff 00 00
23c6: 41 f6 c5 07 test $0x7,%r13b
23ca: 0f 85 33 02 00 00 jne 2603 <Jf_insert_sorted+0x3a3>
The first instruction duplicates the capability of list_head
into the Pizderson frame:
23a0: 4c 89 64 24 40 mov %r12,0x40(%rsp)
This enables the GC to track the object. Note that this is superfluous, since the Fil-C calling convention requires that arguments are tracked by the caller. (Issue 18)
Next we check that list_head
has a valid capability:
23a5: 4d 85 e4 test %r12,%r12
23a8: 0f 84 55 02 00 00 je 2603 <Jf_insert_sorted+0x3a3>
We make a copy of the node
pointer:
23ae: 49 89 f8 mov %rdi,%r8
And we save the filc_object*
(as opposed to the lower
) of the node
capability into another register:
23b1: 4c 8d 4e f0 lea -0x10(%rsi),%r9
These two instructions are almost certainly register allocation goofs on LLVM's part.
Then stash the origin we'll use for the pollcheck that's in the loop (I'll explain that later):
23b5: 4c 8d 1d 9c 26 00 00 lea 0x269c(%rip),%r11 # 4a58 <Jgo_.str+0x48>
This is totally pointless; we're taking up a perfectly good register for no good reason. (Issue 24: LLVM does a bad job of register allocation for origins)
Then we stash a mask we'll use a lot:
23bc: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
23c3: ff 00 00
We wouldn't need this mask if we removed the flags from the aux pointer in InvisiCaps. (Issue 19)
Then we check that list_head
is pointer-aligned:
23c6: 41 f6 c5 07 test $0x7,%r13b
23ca: 0f 85 33 02 00 00 jne 2603 <Jf_insert_sorted+0x3a3>
Now we're at the actual loop. Note that location 23d0
is the loop header (we'll branch back here). Note that next_ptr = list_head
has implicitly happened (we're reusing the same registers that held list_head
's pointer value and capability to be next_ptr
's pointer value and capability).
23d0: 4c 89 ed mov %r13,%rbp
23d3: 4c 29 e5 sub %r12,%rbp
23d6: 0f 82 27 02 00 00 jb 2603 <Jf_insert_sorted+0x3a3>
23dc: 4d 3b 6c 24 f0 cmp -0x10(%r12),%r13
23e1: 0f 83 1c 02 00 00 jae 2603 <Jf_insert_sorted+0x3a3>
23e7: 49 8b 44 24 f8 mov -0x8(%r12),%rax
23ec: 48 21 d0 and %rdx,%rax
23ef: 0f 84 ba 00 00 00 je 24af <Jf_insert_sorted+0x24f>
23f5: 4c 8b 3c 28 mov (%rax,%rbp,1),%r15
23f9: 41 f6 c7 01 test $0x1,%r15b
23fd: 75 67 jne 2466 <Jf_insert_sorted+0x206>
23ff: 4d 8b 75 00 mov 0x0(%r13),%r14
2403: 4c 89 7c 24 48 mov %r15,0x48(%rsp)
2408: 4d 85 f6 test %r14,%r14
240b: 0f 84 bd 00 00 00 je 24ce <Jf_insert_sorted+0x26e>
2411: 4d 85 ff test %r15,%r15
2414: 0f 84 fb 01 00 00 je 2615 <Jf_insert_sorted+0x3b5>
241a: 49 8d 46 08 lea 0x8(%r14),%rax
241e: 4c 39 f8 cmp %r15,%rax
2421: 0f 82 ee 01 00 00 jb 2615 <Jf_insert_sorted+0x3b5>
2427: 49 8b 4f f0 mov -0x10(%r15),%rcx
242b: 48 83 c1 fc add $0xfffffffffffffffc,%rcx
242f: 48 39 c8 cmp %rcx,%rax
2432: 0f 87 dd 01 00 00 ja 2615 <Jf_insert_sorted+0x3b5>
2438: 4c 89 7c 24 50 mov %r15,0x50(%rsp)
243d: 44 39 10 cmp %r10d,(%rax)
2440: 0f 8f 2f 01 00 00 jg 2575 <Jf_insert_sorted+0x315>
2446: f6 43 08 0e testb $0xe,0x8(%rbx)
244a: 75 24 jne 2470 <Jf_insert_sorted+0x210>
244c: 4c 89 7c 24 40 mov %r15,0x40(%rsp)
2451: 4d 89 f5 mov %r14,%r13
2454: 4d 89 fc mov %r15,%r12
2457: 41 f6 c5 07 test $0x7,%r13b
245b: 0f 84 6f ff ff ff je 23d0 <Jf_insert_sorted+0x170>
2461: e9 9d 01 00 00 jmp 2603 <Jf_insert_sorted+0x3a3>
The first thing the loop tries to do is to load *next_ptr
. This requires a bunch of checks, the first of which is a lower bounds check on next_ptr
:
23d0: 4c 89 ed mov %r13,%rbp
23d3: 4c 29 e5 sub %r12,%rbp
23d6: 0f 82 27 02 00 00 jb 2603 <Jf_insert_sorted+0x3a3>
Note that we're doing a sub
with a branch instead of a cmp
because we're going to need the offset from next_ptr
's lower bound to access the invisible capability for *next_ptr
. So, we'll be using %rbp
again.
Next we do an upper bounds check for next_ptr
:
23dc: 4d 3b 6c 24 f0 cmp -0x10(%r12),%r13
23e1: 0f 83 1c 02 00 00 jae 2603 <Jf_insert_sorted+0x3a3>
We do this upper bounds check by directly comparing to the upper bounds, which is enough to prove that next_ptr
points to 8 bytes, since by the time we get here we would have always done an alignment check on next_ptr
(that it's 8-byte aligned).
Then we load next_ptr
's aux pointer and check if it's not null:
23e7: 49 8b 44 24 f8 mov -0x8(%r12),%rax
23ec: 48 21 d0 and %rdx,%rax
23ef: 0f 84 ba 00 00 00 je 24af <Jf_insert_sorted+0x24f>
Checking if aux is not null wouldn't be necessary if we fixed InvisiCaps. (Issue 19) We'll need the aux pointer to load *next_ptr
's capability.
Next we load the lower
for *next_ptr
from its aux (%rax
is the base of the aux allocation and %rbp
is the offset) and check that it doesn't have the atomic box bit set:
23f5: 4c 8b 3c 28 mov (%rax,%rbp,1),%r15
23f9: 41 f6 c7 01 test $0x1,%r15b
23fd: 75 67 jne 2466 <Jf_insert_sorted+0x206>
This check wouldn't be necessary if we fixed InvisiCaps. (Issue 19)
Now we load *next_ptr
's pointer value, and stash its capability in the Pizderson frame:
23ff: 4d 8b 75 00 mov 0x0(%r13),%r14
2403: 4c 89 7c 24 48 mov %r15,0x48(%rsp)
Now we check if *next_ptr
is NULL:
2408: 4d 85 f6 test %r14,%r14
240b: 0f 84 bd 00 00 00 je 24ce <Jf_insert_sorted+0x26e>
If it's NULL, then we're exiting the loop, since from the compiler's standpoint, this code is not part of the loop (it's after the loop completes):
node->next = *next_ptr;
*next_ptr = node;
return node;
We'll get back to the machine code for this later. Inside the loop, the next thing we check (if we know that *next_ptr
is not NULL) is (*next_ptr)->value > value
. But that requires loading ->value
, which means more safety checks.
The first check we need to do to load (*next_ptr)->value
is that *next_ptr
's capability is valid:
2411: 4d 85 ff test %r15,%r15
2414: 0f 84 fb 01 00 00 je 2615 <Jf_insert_sorted+0x3b5>
Then we materialize &(*next_ptr)->value
into %rax
and check that it's above lower bound:
241a: 49 8d 46 08 lea 0x8(%r14),%rax
241e: 4c 39 f8 cmp %r15,%rax
2421: 0f 82 ee 01 00 00 jb 2615 <Jf_insert_sorted+0x3b5>
Then we check that the upper bound is at least sizeof(int)
above &(*next_ptr)->value
:
2427: 49 8b 4f f0 mov -0x10(%r15),%rcx
242b: 48 83 c1 fc add $0xfffffffffffffffc,%rcx
242f: 48 39 c8 cmp %rcx,%rax
2432: 0f 87 dd 01 00 00 ja 2615 <Jf_insert_sorted+0x3b5>
It's only at this point that we save the *next_ptr
's capability into the Pizderson frame:
2438: 4c 89 7c 24 50 mov %r15,0x50(%rsp)
This is not the optimal place to schedule this store, since it's only needed for the pollcheck. (Issue 18)
Finally we check if (*next_ptr)->value > value
, and if it is, we exit the loop:
243d: 44 39 10 cmp %r10d,(%rax)
2440: 0f 8f 2f 01 00 00 jg 2575 <Jf_insert_sorted+0x315>
Now we do the pollcheck. Fil-C's GC uses safepointing to synchronize with the mutator, which requires the mutator to periodically execute pollchecks. Loop backedges must have pollchecks. This pollcheck ended up here:
2446: f6 43 08 0e testb $0xe,0x8(%rbx)
244a: 75 24 jne 2470 <Jf_insert_sorted+0x210>
Note that there are lots of ways to optimize this check. (Issue 25: Optimize pollchecks)
Now, &(*next_ptr)->next
, which happens to be exactly equal to *next_ptr
, becomes the new next_ptr
. Here's the code that makes that happen:
244c: 4c 89 7c 24 40 mov %r15,0x40(%rsp)
2451: 4d 89 f5 mov %r14,%r13
2454: 4d 89 fc mov %r15,%r12
Note that we're again saving *next_ptr
(i.e. the new next_ptr
) into the Pizderson frame, which is dumb. (Issue 18)
Recall that at the top of the loop, next_ptr
was in %r13
(the pointer value) and %r12
(the capability).
Finally we're doing a branch back to the top of the loop, but only if next_ptr
is pointer-aligned. Otherwise, we branch away to code that handles that failure.
2457: 41 f6 c5 07 test $0x7,%r13b
245b: 0f 84 6f ff ff ff je 23d0 <Jf_insert_sorted+0x170>
2461: e9 9d 01 00 00 jmp 2603 <Jf_insert_sorted+0x3a3>
This loop has a number of branches that take us to slow paths, and those got scheduled after the loop.
The first slow path is for the case that *next_ptr
was an atomic pointer:
2466: 49 83 e7 fe and $0xfffffffffffffffe,%r15
246a: 4d 8b 7f 08 mov 0x8(%r15),%r15
246e: eb 8f jmp 23ff <Jf_insert_sorted+0x19f>
This extracts the capability from the atomic box and jumps back into the part of the loop right after the fast-case load of *next_ptr
's capability.
Next is the pollcheck slow path:
2470: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
2475: 48 89 df mov %rbx,%rdi
2478: 49 89 f4 mov %rsi,%r12
247b: 4c 89 de mov %r11,%rsi
247e: 4c 89 c5 mov %r8,%rbp
2481: 4d 89 cd mov %r9,%r13
2484: e8 97 fb ff ff call 2020 <filc_pollcheck_slow@plt>
2489: 4c 8d 1d c8 25 00 00 lea 0x25c8(%rip),%r11 # 4a58 <Jgo_.str+0x48>
2490: 44 8b 54 24 08 mov 0x8(%rsp),%r10d
2495: 4d 89 e9 mov %r13,%r9
2498: 49 89 e8 mov %rbp,%r8
249b: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
24a0: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
24a7: ff 00 00
24aa: 4c 89 e6 mov %r12,%rsi
24ad: eb 9d jmp 244c <Jf_insert_sorted+0x1ec>
The first instruction here saves %rdi
, i.e. the node
pointer, to the stack (which is weird, since it's already been replicated to %r8
):
2470: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
Then we set up the first argument to the pollcheck, which is the thread pointer:
2475: 48 89 df mov %rbx,%rdi
Then we save the lower
for node
to a callee save:
2478: 49 89 f4 mov %rsi,%r12
And we set up the second argument to the pollcheck, which is the origin:
247b: 4c 89 de mov %r11,%rsi
Finally we save %r8
and %r9
(a copy of node
and its capability) to callee saves:
247e: 4c 89 c5 mov %r8,%rbp
2481: 4d 89 cd mov %r9,%r13
Note that it's silly that this code has to do so much work to save registers. It's likely that the fact that filc_pollcheck_slow
clobbers volatile registers also prevents the register allocator from doing everything that it would like, since this save-restore logic is almost certainly due to the greedy register allocator doing live-range-splitting, and that algorithm has a limit on how much of that it will do. So, making slow paths not clobber registers would reduce code size and free up greedy's splitting budget, which could improve register allocation overall. (Issue 27: Slow path calls to the runtime should use a calling convention that preserves all (or most) registers)
Then we call the pollcheck, restore everything clobbered by it, and jump back into the loop:
2489: 4c 8d 1d c8 25 00 00 lea 0x25c8(%rip),%r11 # 4a58 <Jgo_.str+0x48>
2490: 44 8b 54 24 08 mov 0x8(%rsp),%r10d
2495: 4d 89 e9 mov %r13,%r9
2498: 49 89 e8 mov %rbp,%r8
249b: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
24a0: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
24a7: ff 00 00
24aa: 4c 89 e6 mov %r12,%rsi
24ad: eb 9d jmp 244c <Jf_insert_sorted+0x1ec>
The next slow path code is for the case that the aux ptr of next_ptr
was NULL:
24af: 4d 8b 75 00 mov 0x0(%r13),%r14
24b3: 48 c7 44 24 48 00 00 movq $0x0,0x48(%rsp)
24ba: 00 00
24bc: 31 c9 xor %ecx,%ecx
24be: 41 bf 00 00 00 00 mov $0x0,%r15d
24c4: 4d 85 f6 test %r14,%r14
24c7: 74 08 je 24d1 <Jf_insert_sorted+0x271>
24c9: e9 4a 01 00 00 jmp 2618 <Jf_insert_sorted+0x3b8>
It's a bug that this path even exists. (Issue 19) Here, we do some of the same work as when we loaded next_ptr
's aux pointer. But, we know that the lower
is NULL. So, we load the *next_ptr
's pointer value:
24af: 4d 8b 75 00 mov 0x0(%r13),%r14
Then we stash a NULL capability where we would have stashed *next_ptr
's capability:
24b3: 48 c7 44 24 48 00 00 movq $0x0,0x48(%rsp)
24ba: 00 00
This next instruction is zeroing *next_ptr
's capability pointer, which will be used later (and the code we branch to expects it in both %rcx
and %r15
):
24bc: 31 c9 xor %ecx,%ecx
24be: 41 bf 00 00 00 00 mov $0x0,%r15d
Then we check if *next_ptr
is NULL, and if so, we branch to code that is after the loop. This is the same branch we mentioned earlier, just duplicated to this part of the code. If that branch fails, we branch out to a failure path - since we know that it's impossible to load from (*next_ptr)->value
if *next_ptr
has a NULL capability.
24c7: 74 08 je 24d1 <Jf_insert_sorted+0x271>
24c9: e9 4a 01 00 00 jmp 2618 <Jf_insert_sorted+0x3b8>
This code is within the loop's scope from the standpoint of C, but it is after the loop from the standpoint of the compiler:
node->next = *next_ptr;
*next_ptr = node;
return node;
Here's the reason: compilers consider a natural loop to be all of the basic blocks backwards-reachable up to its header, from any blocks that branch to the header while being dominated by the header. This code is backwards-reachable from a return
statement, so not something that branches to the loop header, therefore it's not part of the loop.
That's likely to be the primary reason why it's scheduled after the loop by the compiler.
What makes this code path most fun to find in the generated code is that there are multiple ways to get to it (next_ptr
doesn't have an aux, *next_ptr
is NULL, or (*next_ptr)->value > value
).
24ce: 4c 89 f9 mov %r15,%rcx
24d1: 41 f6 c0 07 test $0x7,%r8b
24d5: 0f 85 63 01 00 00 jne 263e <Jf_insert_sorted+0x3de>
24db: 49 39 f0 cmp %rsi,%r8
24de: 0f 82 5a 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
24e4: 48 8b 46 f8 mov -0x8(%rsi),%rax
24e8: 48 0f ba e0 32 bt $0x32,%rax
24ed: 0f 82 4b 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
24f3: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
24f9: 0f 85 b9 01 00 00 jne 26b8 <Jf_insert_sorted+0x458>
24ff: 45 31 f6 xor %r14d,%r14d
2502: 48 21 d0 and %rdx,%rax
2505: 0f 84 aa 00 00 00 je 25b5 <Jf_insert_sorted+0x355>
250b: 48 29 f7 sub %rsi,%rdi
250e: 4c 8b 0d cb 2a 00 00 mov 0x2acb(%rip),%r9 # 4fe0 <filc_is_marking>
2515: 4d 85 ff test %r15,%r15
2518: 74 0a je 2524 <Jf_insert_sorted+0x2c4>
251a: 41 80 39 00 cmpb $0x0,(%r9)
251e: 0f 85 a0 01 00 00 jne 26c4 <Jf_insert_sorted+0x464>
2524: 48 89 0c 38 mov %rcx,(%rax,%rdi,1)
2528: 4d 89 30 mov %r14,(%r8)
252b: 49 23 54 24 f8 and -0x8(%r12),%rdx
2530: 0f 84 1d 01 00 00 je 2653 <Jf_insert_sorted+0x3f3>
2536: 41 80 39 00 cmpb $0x0,(%r9)
253a: 0f 85 4d 01 00 00 jne 268d <Jf_insert_sorted+0x42d>
2540: 48 89 34 2a mov %rsi,(%rdx,%rbp,1)
2544: 4d 89 45 00 mov %r8,0x0(%r13)
2548: 4c 89 83 80 00 00 00 mov %r8,0x80(%rbx)
254f: 48 89 b3 80 01 00 00 mov %rsi,0x180(%rbx)
2556: ba 08 00 00 00 mov $0x8,%edx
255b: 31 c0 xor %eax,%eax
255d: 48 8b 4c 24 28 mov 0x28(%rsp),%rcx
2562: 48 89 4b 10 mov %rcx,0x10(%rbx)
2566: 48 83 c4 58 add $0x58,%rsp
256a: 5b pop %rbx
256b: 41 5c pop %r12
256d: 41 5d pop %r13
256f: 41 5e pop %r14
2571: 41 5f pop %r15
2573: 5d pop %rbp
2574: c3 ret
2575: 41 f6 c0 07 test $0x7,%r8b
2579: 0f 85 2d 01 00 00 jne 26ac <Jf_insert_sorted+0x44c>
257f: 49 39 f0 cmp %rsi,%r8
2582: 0f 82 24 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2588: 48 8b 46 f8 mov -0x8(%rsi),%rax
258c: 48 0f ba e0 32 bt $0x32,%rax
2591: 0f 82 15 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2597: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
259d: 0f 85 70 01 00 00 jne 2713 <Jf_insert_sorted+0x4b3>
25a3: 4c 89 f9 mov %r15,%rcx
25a6: 41 bf 01 00 00 00 mov $0x1,%r15d
25ac: 48 21 d0 and %rdx,%rax
25af: 0f 85 56 ff ff ff jne 250b <Jf_insert_sorted+0x2ab>
25b5: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
25ba: 48 8d 05 2f 25 00 00 lea 0x252f(%rip),%rax # 4af0 <Jgo_.str+0xe0>
25c1: 48 89 44 24 30 mov %rax,0x30(%rsp)
25c6: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
25cb: 48 89 df mov %rbx,%rdi
25ce: 48 89 74 24 08 mov %rsi,0x8(%rsp)
25d3: 4c 89 ce mov %r9,%rsi
25d6: 4c 89 44 24 20 mov %r8,0x20(%rsp)
25db: e8 20 fb ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
25e0: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
25e5: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
25ea: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
25f1: ff 00 00
25f4: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
25f9: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
25fe: e9 08 ff ff ff jmp 250b <Jf_insert_sorted+0x2ab>
The first instruction, 24ce
, is where we branch to if *next_ptr
is NULL, from the normal path of the loop. The second instruction, 24d1
, is where we branch to if next_ptr
's aux ptr is NULL (and then the only possibility is that *next_ptr
is NULL). And 2575
is where we branch to if (*next_ptr)->value > value
. This path is special because in this path, we would have loaded, and checked, a capability for *next_ptr
. In the other two paths, we would have either not loaded it and inferred it to be NULL, or not checked anything about it.
Let's start with the first two entrypoints, since they're nearly identical:
24ce: 4c 89 f9 mov %r15,%rcx
24d1: 41 f6 c0 07 test $0x7,%r8b
24d5: 0f 85 63 01 00 00 jne 263e <Jf_insert_sorted+0x3de>
24db: 49 39 f0 cmp %rsi,%r8
24de: 0f 82 5a 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
24e4: 48 8b 46 f8 mov -0x8(%rsi),%rax
24e8: 48 0f ba e0 32 bt $0x32,%rax
24ed: 0f 82 4b 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
24f3: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
24f9: 0f 85 b9 01 00 00 jne 26b8 <Jf_insert_sorted+0x458>
24ff: 45 31 f6 xor %r14d,%r14d
2502: 48 21 d0 and %rdx,%rax
2505: 0f 84 aa 00 00 00 je 25b5 <Jf_insert_sorted+0x355>
This code does some of the checks needed to execute:
node->next = *next_ptr;
*next_ptr = node;
But under the assumption that *next_ptr
's pointer value is NULL. Note, this code hardly exploits this assumption. The registers are:
%rcx
has *next_ptr
's capability pointer (i.e. lower
).
%rsi
has node
's capability pointer (i.e. lower
).
%r8
has node
's pointer value.
%r12
is next_ptr
's capability.
%r15
also has *next_ptr
's capability pointer.
So, we start by checking that node
's pointer is pointer-aligned.
24d1: 41 f6 c0 07 test $0x7,%r8b
24d5: 0f 85 63 01 00 00 jne 263e <Jf_insert_sorted+0x3de>
We wouldn't have to check that if we special-cased malloc. (Issue 23)
Then we check that node
is above its lower bound.
24db: 49 39 f0 cmp %rsi,%r8
24de: 0f 82 5a 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
And we check if node
isn't free.
24e4: 48 8b 46 f8 mov -0x8(%rsi),%rax
24e8: 48 0f ba e0 32 bt $0x32,%rax
24ed: 0f 82 4b 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
This is interesting - we could have instead checked the upper bound for &node->next
, but at this point we would have already checked the upper bound for &node->value
, which is higher. So all we really have to do is that the object hasn't been freed (we are obligated to observe it being freed across pollchecks), which is cheaper than checking the upper bound (it's just a bit test).
Of course, maybe this isn't actually cheaper than checking the upper bounds? (Issue 26: Emitting a is-not-free check instead of an upper bounds check is often a pessimization)
Note that now, %rax
holds the aux pointer (including its flags) for node
.
Then we check if *next_ptr
is writable, or rather that the memory pointed to by next_ptr
doesn't have the readonly bit:
24f3: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
24f9: 0f 85 b9 01 00 00 jne 26b8 <Jf_insert_sorted+0x458>
Then we set *next_ptr
's value to NULL:
24ff: 45 31 f6 xor %r14d,%r14d
This is because we know on this path that *next_ptr
must be NULL. Later code will expect *next_ptr
value to be in %r14
, NULL or not.
Finally we check if node
's aux is not NULL (ignoring the high 16 bits):
2502: 48 21 d0 and %rdx,%rax
2505: 0f 84 aa 00 00 00 je 25b5 <Jf_insert_sorted+0x355>
We'll discuss the slow path that this goes to in a little bit. The next code is a branch destination coming from code that handles the (*next_ptr)->value > value
case.
250b: 48 29 f7 sub %rsi,%rdi
250e: 4c 8b 0d cb 2a 00 00 mov 0x2acb(%rip),%r9 # 4fe0 <filc_is_marking>
2515: 4d 85 ff test %r15,%r15
2518: 74 0a je 2524 <Jf_insert_sorted+0x2c4>
251a: 41 80 39 00 cmpb $0x0,(%r9)
251e: 0f 85 a0 01 00 00 jne 26c4 <Jf_insert_sorted+0x464>
2524: 48 89 0c 38 mov %rcx,(%rax,%rdi,1)
2528: 4d 89 30 mov %r14,(%r8)
252b: 49 23 54 24 f8 and -0x8(%r12),%rdx
2530: 0f 84 1d 01 00 00 je 2653 <Jf_insert_sorted+0x3f3>
2536: 41 80 39 00 cmpb $0x0,(%r9)
253a: 0f 85 4d 01 00 00 jne 268d <Jf_insert_sorted+0x42d>
2540: 48 89 34 2a mov %rsi,(%rdx,%rbp,1)
2544: 4d 89 45 00 mov %r8,0x0(%r13)
2548: 4c 89 83 80 00 00 00 mov %r8,0x80(%rbx)
254f: 48 89 b3 80 01 00 00 mov %rsi,0x180(%rbx)
2556: ba 08 00 00 00 mov $0x8,%edx
255b: 31 c0 xor %eax,%eax
255d: 48 8b 4c 24 28 mov 0x28(%rsp),%rcx
2562: 48 89 4b 10 mov %rcx,0x10(%rbx)
2566: 48 83 c4 58 add $0x58,%rsp
256a: 5b pop %rbx
256b: 41 5c pop %r12
256d: 41 5d pop %r13
256f: 41 5e pop %r14
2571: 41 5f pop %r15
2573: 5d pop %rbp
2574: c3 ret
Let's review register state:
%rax
has the aux pointer of node
.
%rbx
is the thread pointer.
%rcx
has *next_ptr
's capability pointer (i.e. lower
).
%rdx
has the 0xffffffffffff
mask (i.e. the low 48 bits mask for aux pointers).
%rbp
is the offset of *next_ptr
- i.e. the pointer value minus the lower
.
%rdi
has node
's pointer value.
%rsi
has node
's capability pointer (i.e. lower
).
%r8
also has node
's pointer value.
%r12
is next_ptr
's capability.
%r13
is next_ptr
's pointer value.
%r14
has *next_ptr
's pointer value.
%r15
also has *next_ptr
's capability pointer, but it's only used to check if the capability is not NULL. When we come to this code from the path that handles (*next_ptr)->value > value
, %r15
will just be 1 (so that we know that the capability isn't NULL). When we actually use the capability, we'll use %rcx
.
Now let's walk through this last bit of code to see what it does. First we compute the offset of the node
pointer:
250b: 48 29 f7 sub %rsi,%rdi
We'll need this to store to node
's aux, when doing node->next = *next_ptr
. So, %rdi
is no longer node's pointer value. Luckily,
%r8
also has the original pointer value (and we'll need it).
The next bit is the store barrier for FUGC.
250e: 4c 8b 0d cb 2a 00 00 mov 0x2acb(%rip),%r9 # 4fe0 <filc_is_marking>
2515: 4d 85 ff test %r15,%r15
2518: 74 0a je 2524 <Jf_insert_sorted+0x2c4>
251a: 41 80 39 00 cmpb $0x0,(%r9)
251e: 0f 85 a0 01 00 00 jne 26c4 <Jf_insert_sorted+0x464>
This is what the barrier looks like in C:
static inline void filc_store_barrier(filc_thread* my_thread, filc_object* target)
{
if (PAS_UNLIKELY(filc_is_marking) && target)
filc_store_barrier_slow(my_thread, target);
}
Here, target
is %r15
, i.e. *next_ptr
's capability. First we check that it's not NULL (if it is, we skip the barrier), and then we check if filc_is_marking
(if it is, we go to a slow path that calls filc_store_barrier_slow
). Note we'll keep using %r9
, because we'll have another barrier, so we'll again have to check if filc_is_marking
. It's legal to cache this value up to any safepoint (like a call or pollcheck).
Then we can actually do the store for node->next = *next_ptr
:
2524: 48 89 0c 38 mov %rcx,(%rax,%rdi,1)
2528: 4d 89 30 mov %r14,(%r8)
We're storing *next_ptr
's capability into node
's aux at the offset of node->next
, and we're storing *next_ptr
's pointer value into node->next
itself.
Next we load next_ptr
's aux, mask it (%rdx
had the mask), and check if it's NULL. If it's NULL, we jump to a slow path.
252b: 49 23 54 24 f8 and -0x8(%r12),%rdx
2530: 0f 84 1d 01 00 00 je 2653 <Jf_insert_sorted+0x3f3>
Now, %rdx
is known to have a non-NULL aux pointer (with flags masked off) for next_ptr
.
Next we check if we're marking again, because if we are, we'll have to run a barrier with target
being node
. We know that node
has a non-NULL capability already.
2536: 41 80 39 00 cmpb $0x0,(%r9)
253a: 0f 85 4d 01 00 00 jne 268d <Jf_insert_sorted+0x42d>
Finally, we can do the store for *next_ptr = node
:
2540: 48 89 34 2a mov %rsi,(%rdx,%rbp,1)
2544: 4d 89 45 00 mov %r8,0x0(%r13)
Again, since this is a pointer, we store the capability into the invisible capability location in next_ptr
's aux and then we store the pointer value into *next_ptr
itself.
Then we set up the return value. This function returns node
.
2548: 4c 89 83 80 00 00 00 mov %r8,0x80(%rbx)
254f: 48 89 b3 80 01 00 00 mov %rsi,0x180(%rbx)
2556: ba 08 00 00 00 mov $0x8,%edx
255b: 31 c0 xor %eax,%eax
This returns 0x8
as the second return value (i.e. the return_size
). And it returns false
for the first return value (i.e. has_exception
).
Finally we tear down the Pizderson frame, by retrieving the current frame's parent
and storing it into the thread's top frame:
255d: 48 8b 4c 24 28 mov 0x28(%rsp),%rcx
2562: 48 89 4b 10 mov %rcx,0x10(%rbx)
And then we can actually return:
2566: 48 83 c4 58 add $0x58,%rsp
256a: 5b pop %rbx
256b: 41 5c pop %r12
256d: 41 5d pop %r13
256f: 41 5e pop %r14
2571: 41 5f pop %r15
2573: 5d pop %rbp
2574: c3 ret
But that's not all, since so far we've only covered the case where we get to this path if *next_ptr
is NULL or if next_ptr
's aux ptr is NULL. Here's the code we branch to if next_ptr
has a non-NULL aux, *next_ptr
is not NULL, and (*next_ptr)->value > value
:
2575: 41 f6 c0 07 test $0x7,%r8b
2579: 0f 85 2d 01 00 00 jne 26ac <Jf_insert_sorted+0x44c>
257f: 49 39 f0 cmp %rsi,%r8
2582: 0f 82 24 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2588: 48 8b 46 f8 mov -0x8(%rsi),%rax
258c: 48 0f ba e0 32 bt $0x32,%rax
2591: 0f 82 15 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2597: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
259d: 0f 85 70 01 00 00 jne 2713 <Jf_insert_sorted+0x4b3>
25a3: 4c 89 f9 mov %r15,%rcx
25a6: 41 bf 01 00 00 00 mov $0x1,%r15d
25ac: 48 21 d0 and %rdx,%rax
25af: 0f 85 56 ff ff ff jne 250b <Jf_insert_sorted+0x2ab>
25b5: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
25ba: 48 8d 05 2f 25 00 00 lea 0x252f(%rip),%rax # 4af0 <Jgo_.str+0xe0>
25c1: 48 89 44 24 30 mov %rax,0x30(%rsp)
25c6: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
25cb: 48 89 df mov %rbx,%rdi
25ce: 48 89 74 24 08 mov %rsi,0x8(%rsp)
25d3: 4c 89 ce mov %r9,%rsi
25d6: 4c 89 44 24 20 mov %r8,0x20(%rsp)
25db: e8 20 fb ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
25e0: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
25e5: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
25ea: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
25f1: ff 00 00
25f4: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
25f9: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
25fe: e9 08 ff ff ff jmp 250b <Jf_insert_sorted+0x2ab>
This code redoes much of the same checks that we did for the other entrypoints:
2575: 41 f6 c0 07 test $0x7,%r8b
2579: 0f 85 2d 01 00 00 jne 26ac <Jf_insert_sorted+0x44c>
257f: 49 39 f0 cmp %rsi,%r8
2582: 0f 82 24 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2588: 48 8b 46 f8 mov -0x8(%rsi),%rax
258c: 48 0f ba e0 32 bt $0x32,%rax
2591: 0f 82 15 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2597: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
259d: 0f 85 70 01 00 00 jne 2713 <Jf_insert_sorted+0x4b3>
25a3: 4c 89 f9 mov %r15,%rcx
25a6: 41 bf 01 00 00 00 mov $0x1,%r15d
25ac: 48 21 d0 and %rdx,%rax
25af: 0f 85 56 ff ff ff jne 250b <Jf_insert_sorted+0x2ab>
First we check that node
's pointer is pointer-aligned:
2575: 41 f6 c0 07 test $0x7,%r8b
2579: 0f 85 2d 01 00 00 jne 26ac <Jf_insert_sorted+0x44c>
Then we check that node
is above its lower bound:
257f: 49 39 f0 cmp %rsi,%r8
2582: 0f 82 24 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
Then we check that node
isn't free:
2588: 48 8b 46 f8 mov -0x8(%rsi),%rax
258c: 48 0f ba e0 32 bt $0x32,%rax
2591: 0f 82 15 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
And if *next_ptr
is writable:
2597: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
259d: 0f 85 70 01 00 00 jne 2713 <Jf_insert_sorted+0x4b3>
Now we make sure that %rcx
has a copy of *next_ptr
's capability pointer.
25a3: 4c 89 f9 mov %r15,%rcx
This next part is goofy; it's an example of how a compiler's complexity often leads to bizarre emergent behavior.
25a6: 41 bf 01 00 00 00 mov $0x1,%r15d
Along most paths, %r15
is *next_ptr
's capability pointer, and %rcx
also has a copy of exactly the same value. We're about to branch to code that will only use %r15
to branch on whether the capability is NULL or not. We actually know already that the capability isn't NULL. So, this sets it to 1. Note that if we removed this instruction, then nothing would change, since %r15
is non-NULL already.
Then we make sure that %rax
, which contains the aux pointer for node
, is masked so that the high bits are zero (the flags are no longer there so the pointer can be dereferenced). Recall that we had stashed the mask in %rdx
already. Upon masking, we branch to 250b
- i.e. the code we analyzed previously that finishes the store and returns from insert_sorted
if the aux pointer is not NULL.
25ac: 48 21 d0 and %rdx,%rax
25af: 0f 85 56 ff ff ff jne 250b <Jf_insert_sorted+0x2ab>
If it is NULL, then we fall through to this code, which initializes the aux pointer for node
.
25b5: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
25ba: 48 8d 05 2f 25 00 00 lea 0x252f(%rip),%rax # 4af0 <Jgo_.str+0xe0>
25c1: 48 89 44 24 30 mov %rax,0x30(%rsp)
25c6: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
25cb: 48 89 df mov %rbx,%rdi
25ce: 48 89 74 24 08 mov %rsi,0x8(%rsp)
25d3: 4c 89 ce mov %r9,%rsi
25d6: 4c 89 44 24 20 mov %r8,0x20(%rsp)
25db: e8 20 fb ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
25e0: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
25e5: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
25ea: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
25f1: ff 00 00
25f4: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
25f9: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
25fe: e9 08 ff ff ff jmp 250b <Jf_insert_sorted+0x2ab>
Most of this code is saving and restoring registers, which isn't needed if we used the right calling convention for this slow path. (Issue 27)
Let's break down what's happening here. First, we save %rcx
since we'll need it after the call returns and %rcx
is a volatile register.
25b5: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
Then we set up the origin. This is necessary because filc_object_ensure_aux_ptr_outline
has safety checks that may lead to the stack being dumped.
25ba: 48 8d 05 2f 25 00 00 lea 0x252f(%rip),%rax # 4af0 <Jgo_.str+0xe0>
25c1: 48 89 44 24 30 mov %rax,0x30(%rsp)
Then next instructions save volatile registers while also setting up the arguments to filc_object_ensure_aux_ptr_outline
:
25c6: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
25cb: 48 89 df mov %rbx,%rdi
25ce: 48 89 74 24 08 mov %rsi,0x8(%rsp)
25d3: 4c 89 ce mov %r9,%rsi
25d6: 4c 89 44 24 20 mov %r8,0x20(%rsp)
The first argument (%rdi
) is the thread pointer, and the second argument (%rsi
) is node
's capability, represented specifically as its filc_object*
rather than the lower (which had previously been in %r9
).
Now we can make the call, and when it returns, we restore everything we saved and jump back to 250b
, which finishes doing the stores and returns from insert_sorted
:
25db: e8 20 fb ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
25e0: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
25e5: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
25ea: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
25f1: ff 00 00
25f4: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
25f9: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
25fe: e9 08 ff ff ff jmp 250b <Jf_insert_sorted+0x2ab>
All that's left at this point are slow paths and check fail paths.
Let's look at a slow path that we branched to in the above code. Specifically, if next_ptr
's aux was NULL:
2653: 49 83 c4 f0 add $0xfffffffffffffff0,%r12
2657: 48 8d 05 d2 24 00 00 lea 0x24d2(%rip),%rax # 4b30 <Jgo_.str+0x120>
265e: 48 89 44 24 30 mov %rax,0x30(%rsp)
2663: 48 89 df mov %rbx,%rdi
2666: 49 89 f6 mov %rsi,%r14
2669: 4c 89 e6 mov %r12,%rsi
266c: 4d 89 c7 mov %r8,%r15
266f: 4d 89 cc mov %r9,%r12
2672: e8 89 fa ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
2677: 4d 89 e1 mov %r12,%r9
267a: 4d 89 f8 mov %r15,%r8
267d: 4c 89 f6 mov %r14,%rsi
2680: 48 89 c2 mov %rax,%rdx
2683: 41 80 39 00 cmpb $0x0,(%r9)
2687: 0f 84 b3 fe ff ff je 2540 <Jf_insert_sorted+0x2e0>
268d: 48 89 df mov %rbx,%rdi
2690: 49 89 f6 mov %rsi,%r14
2693: 49 89 d7 mov %rdx,%r15
2696: 4d 89 c4 mov %r8,%r12
2699: e8 02 fa ff ff call 20a0 <filc_store_barrier_for_lower_slow@plt>
269e: 4d 89 e0 mov %r12,%r8
26a1: 4c 89 fa mov %r15,%rdx
26a4: 4c 89 f6 mov %r14,%rsi
26a7: e9 94 fe ff ff jmp 2540 <Jf_insert_sorted+0x2e0>
Again, we're calling filc_object_ensure_aux_ptr_outline
. To do that we need to first get %r12
- next_ptr
's capability - to point at the filc_object
rather than the lower, so we subtract 16 (sizeof(filc_object)
):
2653: 49 83 c4 f0 add $0xfffffffffffffff0,%r12
Then we set up the origin:
2657: 48 8d 05 d2 24 00 00 lea 0x24d2(%rip),%rax # 4b30 <Jgo_.str+0x120>
265e: 48 89 44 24 30 mov %rax,0x30(%rsp)
And finally we save registers while setting up the first argument to be the thread and the second argument to be next_ptr
's object:
2663: 48 89 df mov %rbx,%rdi
2666: 49 89 f6 mov %rsi,%r14
2669: 4c 89 e6 mov %r12,%rsi
266c: 4d 89 c7 mov %r8,%r15
266f: 4d 89 cc mov %r9,%r12
2672: e8 89 fa ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
Then we restore registers:
2677: 4d 89 e1 mov %r12,%r9
267a: 4d 89 f8 mov %r15,%r8
267d: 4c 89 f6 mov %r14,%rsi
2680: 48 89 c2 mov %rax,%rdx
We could branch back to the normal path here, but instead, the compiler chose to emit the barrier check immediately.
2683: 41 80 39 00 cmpb $0x0,(%r9)
2687: 0f 84 b3 fe ff ff je 2540 <Jf_insert_sorted+0x2e0>
If the barrier isn't required, this branches to 2540
, which we saw previously - that's where we actually do the store for *next_ptr = node
. If the barrier is required, then we fall through to the barrier path for that store:
268d: 48 89 df mov %rbx,%rdi
2690: 49 89 f6 mov %rsi,%r14
2693: 49 89 d7 mov %rdx,%r15
2696: 4d 89 c4 mov %r8,%r12
2699: e8 02 fa ff ff call 20a0 <filc_store_barrier_for_lower_slow@plt>
269e: 4d 89 e0 mov %r12,%r8
26a1: 4c 89 fa mov %r15,%rdx
26a4: 4c 89 f6 mov %r14,%rsi
26a7: e9 94 fe ff ff jmp 2540 <Jf_insert_sorted+0x2e0>
The signature of filc_store_barrier_for_lower_slow
is:
void filc_store_barrier_for_lower_slow(filc_thread* my_thread, void* lower);
So, we're moving the thread pointer into the first argument (%rdi
), and %rsi
already had the capability for node
.
And then there's another store barrier slow path, this time for node->next = *next_ptr
:
26c4: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
26c9: 48 89 df mov %rbx,%rdi
26cc: 48 89 74 24 08 mov %rsi,0x8(%rsp)
26d1: 48 89 ce mov %rcx,%rsi
26d4: 4c 89 44 24 20 mov %r8,0x20(%rsp)
26d9: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
26de: 49 89 c7 mov %rax,%r15
26e1: e8 ba f9 ff ff call 20a0 <filc_store_barrier_for_lower_slow@plt>
26e6: 4c 8b 0d f3 28 00 00 mov 0x28f3(%rip),%r9 # 4fe0 <filc_is_marking>
26ed: 4c 89 f8 mov %r15,%rax
26f0: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
26f5: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
26fa: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
26ff: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
2706: ff 00 00
2709: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
270e: e9 11 fe ff ff jmp 2524 <Jf_insert_sorted+0x2c4>
Lots of the code for these slow paths would go away if we used the right calling convention for runtime calls. (Issue 27)
This path handles a total of 6 check failures, all to do with next_ptr
and list_head
:
2603: 48 8d 15 c6 24 00 00 lea 0x24c6(%rip),%rdx # 4ad0 <Jgo_.str+0xc0>
260a: 4c 89 ef mov %r13,%rdi
260d: 4c 89 e6 mov %r12,%rsi
2610: e8 db fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
All but one of the branches to this code go to 2603
, but one branch goes to 260a
(in that case, a different origin is supplied).
Here's the signature of filc_optimized_access_check_fail
:
PAS_NEVER_INLINE PAS_NO_RETURN void filc_optimized_access_check_fail(
filc_ptr ptr, const filc_optimized_access_check_origin* check_origin);
So, this is a tail call, and the first argument (passed in %rdi
and %rsi
) is the pointer that caused the failure (next_ptr
/list_head
here) while the second argument (passed in %rdx
) is a compiler-generated constant global data object that provides the runtime with the metadata necessary to describe the failure.
Next up, this code has two entrypoints handling four total failures:
2615: 4c 89 f9 mov %r15,%rcx
2618: 49 83 c6 08 add $0x8,%r14
261c: 48 8d 15 4d 25 00 00 lea 0x254d(%rip),%rdx # 4b70 <Jgo_.str+0x160>
2623: 4c 89 f7 mov %r14,%rdi
2626: 48 89 ce mov %rcx,%rsi
2629: e8 c2 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
Three branches go to 2615
and one branch goes to 2618
. All of them have to do with checking *next_ptr
.
Now we handle four different check failures to do with node
:
262e: 48 83 c7 08 add $0x8,%rdi
2632: 48 8d 15 67 24 00 00 lea 0x2467(%rip),%rdx # 4aa0 <Jgo_.str+0x90>
2639: e8 b2 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
Now we handle three other check failures for node
, this time when storing to node->next
:
263e: 48 8d 15 cb 24 00 00 lea 0x24cb(%rip),%rdx # 4b10 <Jgo_.str+0x100>
2645: e8 a6 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
It's possible that malloc
"threw an exception", in which case we just propagate it. If the exception check after the malloc
call detects this, we branch here. This sets the return value to has_exception = true
and return_size = 0
, and then jumps to the return path:
264a: b0 01 mov $0x1,%al
264c: 31 d2 xor %edx,%edx
264e: e9 0a ff ff ff jmp 255d <Jf_insert_sorted+0x2fd>
This services three different check failures, all to do with node
:
26ac: 48 8d 15 dd 24 00 00 lea 0x24dd(%rip),%rdx # 4b90 <Jgo_.str+0x180>
26b3: e8 38 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
Note that this expects %rdi
/%rsi
to already have node
's pointer and capability.
And then this handles exactly one failure, to do with *next_ptr
:
26b8: 48 8d 15 81 24 00 00 lea 0x2481(%rip),%rdx # 4b40 <Jgo_.str+0x130>
26bf: e9 46 ff ff ff jmp 260a <Jf_insert_sorted+0x3aa>
We're just setting up the origin and jumping to the code that handles the rest of the call to filc_optimized_access_check_fail
.
The function call to malloc
might fail because malloc
wasn't a function pointer; those three check failures go here:
271f: 48 89 c7 mov %rax,%rdi
2722: 48 89 d6 mov %rdx,%rsi
2725: e8 b6 f9 ff ff call 20e0 <filc_check_function_call_fail@plt>
Then there's the case where insert_sorted
was not passed enough arguments; that one check goes here:
272a: 48 89 f0 mov %rsi,%rax
272d: 48 8d 15 34 23 00 00 lea 0x2334(%rip),%rdx # 4a68 <Jgo_.str+0x58>
2734: be 10 00 00 00 mov $0x10,%esi
2739: 48 89 c7 mov %rax,%rdi
273c: e8 ff f8 ff ff call 2040 <filc_cc_args_check_failure@plt>
And finally there's the case where malloc
didn't return a value; that one check failure goes here:
2741: 48 8d 05 88 24 00 00 lea 0x2488(%rip),%rax # 4bd0 <Jgo_.str+0x1c0>
2748: be 08 00 00 00 mov $0x8,%esi
274d: 48 89 d7 mov %rdx,%rdi
2750: 48 89 c2 mov %rax,%rdx
2753: e8 78 f9 ff ff call 20d0 <filc_cc_rets_check_failure@plt>
Fil-C can compile C in a way that makes the code memory-safe. This post shows how Fil-C is very comprehensive in the checking it emits. Analyzing that code shows lots of room for improvement. I filed the following issues by doing this exercise:
Issue 16: Turn the thread pointer into a proper pinned register
Issue 18: Optimize how the compiler emits stores to the filc_frame lowers array
Issue 20: Function capabilities returned from pizlonated getters are never offset
Issue 21: Implement Fil-C's unwinding in terms of native unwinding
Issue 23: Turn malloc calls into direct calls into the GC allocator
Issue 24: LLVM does a bad job of register allocation for origins
Issue 26: Emitting a is-not-free check instead of an upper bounds check is often a pessimization