In this document I will explain what is going on in the machine code generated by the Fil-C compiler for the insert_sorted
function in this simple program:
#include <stdlib.h>
#include <stdio.h>
/* Singly-linked list node: 'next' points at the following node (NULL terminates
 * the list) and 'value' is the payload kept in ascending order by insert_sorted. */
struct list {
struct list* next;
int value;
};
/*
 * Insert a freshly allocated node carrying 'value' into the list rooted at
 * *list_head, keeping the list sorted in ascending order, and return the new node.
 * 'noinline' keeps the function as a distinct symbol so its generated machine
 * code can be examined in isolation (the point of this document).
 *
 * NOTE(review): the malloc result is used unchecked; on allocation failure this
 * dereferences NULL. Deliberate for a small demo — kept as-is so the source
 * matches the disassembly discussed below.
 */
__attribute__((noinline)) struct list* insert_sorted(struct list** list_head, int value)
{
struct list* node = (struct list*)malloc(sizeof(struct list));
node->value = value;
struct list** next_ptr;
/* Walk the chain of 'next' slots; next_ptr always addresses the pointer we may
 * overwrite, so insertion works uniformly at the head, middle, or tail. */
for (next_ptr = list_head; ; next_ptr = &(*next_ptr)->next) {
/* Insert before the first node with a larger value, or at the end. */
if (!*next_ptr || (*next_ptr)->value > value) {
node->next = *next_ptr;
*next_ptr = node;
return node;
}
}
}
/* Driver: insert 100 pseudo-random values into an initially empty list, then
 * print them in sorted order. Returns 0 on success. */
int main()
{
struct list* list = NULL;
unsigned i;
unsigned value = 42;
for (i = 100; i--;) {
/* xorshift32 PRNG step (Marsaglia) — cheap deterministic "random" input. */
value ^= value << 13;
value ^= value >> 17;
value ^= value << 5;
insert_sorted(&list, value);
}
/* The nodes are never freed; fine for a demo process that exits immediately. */
for (; list; list = list->next)
printf("%d\n", list->value);
return 0;
}
The goal of this document is to explain to you how Fil-C works today. Since it's such a young implementation, Fil-C has lots of optimization opportunities. This document links to GH issues for all of the optimization opportunities it identifies.
I compiled this program with my build of the Fil-C compiler (revision 61e304b42d01e3fc132dac802fe3f4b2c299653d, not far off from Fil-C release 0.668.3):
build/bin/clang -o test43 test43.c -O2 -g
And then dumped the disassembly with:
objdump -d test43
Here's the full disassembly of insert_sorted
. Don't worry if you don't get it immediately; I'll explain it instruction by instruction.
0000000000002250 <pizlonated_insert_sorted>:
2250: 48 8d 05 09 00 00 00 lea 0x9(%rip),%rax # 2260 <Jf_insert_sorted>
2257: 48 8d 15 ca 29 00 00 lea 0x29ca(%rip),%rdx # 4c28 <Jfo_insert_sorted+0x10>
225e: c3 ret
0000000000002260 <Jf_insert_sorted>:
2260: 55 push %rbp
2261: 41 57 push %r15
2263: 41 56 push %r14
2265: 41 55 push %r13
2267: 41 54 push %r12
2269: 53 push %rbx
226a: 48 83 ec 58 sub $0x58,%rsp
226e: 48 39 27 cmp %rsp,(%rdi)
2271: 0f 83 e9 fd ff ff jae 2060 <filc_stack_overflow_failure@plt>
2277: 48 8b 4f 10 mov 0x10(%rdi),%rcx
227b: 48 89 4c 24 28 mov %rcx,0x28(%rsp)
2280: 48 8d 4c 24 28 lea 0x28(%rsp),%rcx
2285: 48 89 4f 10 mov %rcx,0x10(%rdi)
2289: 48 8d 0d d8 27 00 00 lea 0x27d8(%rip),%rcx # 4a68 <Jgo_.str+0x58>
2290: 48 89 4c 24 30 mov %rcx,0x30(%rsp)
2295: 0f 57 c0 xorps %xmm0,%xmm0
2298: 0f 11 44 24 38 movups %xmm0,0x38(%rsp)
229d: 0f 11 44 24 48 movups %xmm0,0x48(%rsp)
22a2: 48 83 fe 0f cmp $0xf,%rsi
22a6: 0f 86 7e 04 00 00 jbe 272a <Jf_insert_sorted+0x4ca>
22ac: 48 89 fb mov %rdi,%rbx
22af: 4c 8b af 80 00 00 00 mov 0x80(%rdi),%r13
22b6: 8b 87 88 00 00 00 mov 0x88(%rdi),%eax
22bc: 89 44 24 08 mov %eax,0x8(%rsp)
22c0: 4c 8b a7 80 01 00 00 mov 0x180(%rdi),%r12
22c7: 48 8d 35 02 29 00 00 lea 0x2902(%rip),%rsi # 4bd0 <Jgo_.str+0x1c0>
22ce: e8 ed fd ff ff call 20c0 <pizlonated_malloc@plt>
22d3: 48 c7 83 80 00 00 00 movq $0x10,0x80(%rbx)
22da: 10 00 00 00
22de: 48 c7 83 80 01 00 00 movq $0x0,0x180(%rbx)
22e5: 00 00 00 00
22e9: 48 8d 0d 18 29 00 00 lea 0x2918(%rip),%rcx # 4c08 <Jgo_.str+0x1f8>
22f0: 48 89 4c 24 30 mov %rcx,0x30(%rsp)
22f5: 48 85 d2 test %rdx,%rdx
22f8: 0f 84 21 04 00 00 je 271f <Jf_insert_sorted+0x4bf>
22fe: 48 8b 4a f8 mov -0x8(%rdx),%rcx
2302: 48 be 00 00 00 00 00 movabs $0x3c0000000000000,%rsi
2309: 00 c0 03
230c: 48 21 ce and %rcx,%rsi
230f: 48 bf 00 00 00 00 00 movabs $0x40000000000000,%rdi
2316: 00 40 00
2319: 48 39 fe cmp %rdi,%rsi
231c: 0f 85 fd 03 00 00 jne 271f <Jf_insert_sorted+0x4bf>
2322: 48 be ff ff ff ff ff movabs $0xffffffffffff,%rsi
2329: ff 00 00
232c: 48 21 f1 and %rsi,%rcx
232f: 48 39 c8 cmp %rcx,%rax
2332: 0f 85 e7 03 00 00 jne 271f <Jf_insert_sorted+0x4bf>
2338: be 08 00 00 00 mov $0x8,%esi
233d: 48 89 df mov %rbx,%rdi
2340: ff d0 call *%rax
2342: a8 01 test $0x1,%al
2344: 0f 85 00 03 00 00 jne 264a <Jf_insert_sorted+0x3ea>
234a: 48 83 fa 07 cmp $0x7,%rdx
234e: 0f 86 ed 03 00 00 jbe 2741 <Jf_insert_sorted+0x4e1>
2354: 48 8b bb 80 00 00 00 mov 0x80(%rbx),%rdi
235b: 48 8b b3 80 01 00 00 mov 0x180(%rbx),%rsi
2362: 48 89 74 24 38 mov %rsi,0x38(%rsp)
2367: 48 85 f6 test %rsi,%rsi
236a: 44 8b 54 24 08 mov 0x8(%rsp),%r10d
236f: 0f 84 b9 02 00 00 je 262e <Jf_insert_sorted+0x3ce>
2375: f6 46 fe 06 testb $0x6,-0x2(%rsi)
2379: 0f 85 af 02 00 00 jne 262e <Jf_insert_sorted+0x3ce>
237f: 48 8d 47 08 lea 0x8(%rdi),%rax
2383: 48 39 f0 cmp %rsi,%rax
2386: 0f 82 a2 02 00 00 jb 262e <Jf_insert_sorted+0x3ce>
238c: 48 8b 4e f0 mov -0x10(%rsi),%rcx
2390: 48 83 c1 fc add $0xfffffffffffffffc,%rcx
2394: 48 39 c8 cmp %rcx,%rax
2397: 0f 87 91 02 00 00 ja 262e <Jf_insert_sorted+0x3ce>
239d: 44 89 10 mov %r10d,(%rax)
23a0: 4c 89 64 24 40 mov %r12,0x40(%rsp)
23a5: 4d 85 e4 test %r12,%r12
23a8: 0f 84 55 02 00 00 je 2603 <Jf_insert_sorted+0x3a3>
23ae: 49 89 f8 mov %rdi,%r8
23b1: 4c 8d 4e f0 lea -0x10(%rsi),%r9
23b5: 4c 8d 1d 9c 26 00 00 lea 0x269c(%rip),%r11 # 4a58 <Jgo_.str+0x48>
23bc: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
23c3: ff 00 00
23c6: 41 f6 c5 07 test $0x7,%r13b
23ca: 0f 85 33 02 00 00 jne 2603 <Jf_insert_sorted+0x3a3>
23d0: 4c 89 ed mov %r13,%rbp
23d3: 4c 29 e5 sub %r12,%rbp
23d6: 0f 82 27 02 00 00 jb 2603 <Jf_insert_sorted+0x3a3>
23dc: 4d 3b 6c 24 f0 cmp -0x10(%r12),%r13
23e1: 0f 83 1c 02 00 00 jae 2603 <Jf_insert_sorted+0x3a3>
23e7: 49 8b 44 24 f8 mov -0x8(%r12),%rax
23ec: 48 21 d0 and %rdx,%rax
23ef: 0f 84 ba 00 00 00 je 24af <Jf_insert_sorted+0x24f>
23f5: 4c 8b 3c 28 mov (%rax,%rbp,1),%r15
23f9: 41 f6 c7 01 test $0x1,%r15b
23fd: 75 67 jne 2466 <Jf_insert_sorted+0x206>
23ff: 4d 8b 75 00 mov 0x0(%r13),%r14
2403: 4c 89 7c 24 48 mov %r15,0x48(%rsp)
2408: 4d 85 f6 test %r14,%r14
240b: 0f 84 bd 00 00 00 je 24ce <Jf_insert_sorted+0x26e>
2411: 4d 85 ff test %r15,%r15
2414: 0f 84 fb 01 00 00 je 2615 <Jf_insert_sorted+0x3b5>
241a: 49 8d 46 08 lea 0x8(%r14),%rax
241e: 4c 39 f8 cmp %r15,%rax
2421: 0f 82 ee 01 00 00 jb 2615 <Jf_insert_sorted+0x3b5>
2427: 49 8b 4f f0 mov -0x10(%r15),%rcx
242b: 48 83 c1 fc add $0xfffffffffffffffc,%rcx
242f: 48 39 c8 cmp %rcx,%rax
2432: 0f 87 dd 01 00 00 ja 2615 <Jf_insert_sorted+0x3b5>
2438: 4c 89 7c 24 50 mov %r15,0x50(%rsp)
243d: 44 39 10 cmp %r10d,(%rax)
2440: 0f 8f 2f 01 00 00 jg 2575 <Jf_insert_sorted+0x315>
2446: f6 43 08 0e testb $0xe,0x8(%rbx)
244a: 75 24 jne 2470 <Jf_insert_sorted+0x210>
244c: 4c 89 7c 24 40 mov %r15,0x40(%rsp)
2451: 4d 89 f5 mov %r14,%r13
2454: 4d 89 fc mov %r15,%r12
2457: 41 f6 c5 07 test $0x7,%r13b
245b: 0f 84 6f ff ff ff je 23d0 <Jf_insert_sorted+0x170>
2461: e9 9d 01 00 00 jmp 2603 <Jf_insert_sorted+0x3a3>
2466: 49 83 e7 fe and $0xfffffffffffffffe,%r15
246a: 4d 8b 7f 08 mov 0x8(%r15),%r15
246e: eb 8f jmp 23ff <Jf_insert_sorted+0x19f>
2470: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
2475: 48 89 df mov %rbx,%rdi
2478: 49 89 f4 mov %rsi,%r12
247b: 4c 89 de mov %r11,%rsi
247e: 4c 89 c5 mov %r8,%rbp
2481: 4d 89 cd mov %r9,%r13
2484: e8 97 fb ff ff call 2020 <filc_pollcheck_slow@plt>
2489: 4c 8d 1d c8 25 00 00 lea 0x25c8(%rip),%r11 # 4a58 <Jgo_.str+0x48>
2490: 44 8b 54 24 08 mov 0x8(%rsp),%r10d
2495: 4d 89 e9 mov %r13,%r9
2498: 49 89 e8 mov %rbp,%r8
249b: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
24a0: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
24a7: ff 00 00
24aa: 4c 89 e6 mov %r12,%rsi
24ad: eb 9d jmp 244c <Jf_insert_sorted+0x1ec>
24af: 4d 8b 75 00 mov 0x0(%r13),%r14
24b3: 48 c7 44 24 48 00 00 movq $0x0,0x48(%rsp)
24ba: 00 00
24bc: 31 c9 xor %ecx,%ecx
24be: 41 bf 00 00 00 00 mov $0x0,%r15d
24c4: 4d 85 f6 test %r14,%r14
24c7: 74 08 je 24d1 <Jf_insert_sorted+0x271>
24c9: e9 4a 01 00 00 jmp 2618 <Jf_insert_sorted+0x3b8>
24ce: 4c 89 f9 mov %r15,%rcx
24d1: 41 f6 c0 07 test $0x7,%r8b
24d5: 0f 85 63 01 00 00 jne 263e <Jf_insert_sorted+0x3de>
24db: 49 39 f0 cmp %rsi,%r8
24de: 0f 82 5a 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
24e4: 48 8b 46 f8 mov -0x8(%rsi),%rax
24e8: 48 0f ba e0 32 bt $0x32,%rax
24ed: 0f 82 4b 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
24f3: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
24f9: 0f 85 b9 01 00 00 jne 26b8 <Jf_insert_sorted+0x458>
24ff: 45 31 f6 xor %r14d,%r14d
2502: 48 21 d0 and %rdx,%rax
2505: 0f 84 aa 00 00 00 je 25b5 <Jf_insert_sorted+0x355>
250b: 48 29 f7 sub %rsi,%rdi
250e: 4c 8b 0d cb 2a 00 00 mov 0x2acb(%rip),%r9 # 4fe0 <filc_is_marking>
2515: 4d 85 ff test %r15,%r15
2518: 74 0a je 2524 <Jf_insert_sorted+0x2c4>
251a: 41 80 39 00 cmpb $0x0,(%r9)
251e: 0f 85 a0 01 00 00 jne 26c4 <Jf_insert_sorted+0x464>
2524: 48 89 0c 38 mov %rcx,(%rax,%rdi,1)
2528: 4d 89 30 mov %r14,(%r8)
252b: 49 23 54 24 f8 and -0x8(%r12),%rdx
2530: 0f 84 1d 01 00 00 je 2653 <Jf_insert_sorted+0x3f3>
2536: 41 80 39 00 cmpb $0x0,(%r9)
253a: 0f 85 4d 01 00 00 jne 268d <Jf_insert_sorted+0x42d>
2540: 48 89 34 2a mov %rsi,(%rdx,%rbp,1)
2544: 4d 89 45 00 mov %r8,0x0(%r13)
2548: 4c 89 83 80 00 00 00 mov %r8,0x80(%rbx)
254f: 48 89 b3 80 01 00 00 mov %rsi,0x180(%rbx)
2556: ba 08 00 00 00 mov $0x8,%edx
255b: 31 c0 xor %eax,%eax
255d: 48 8b 4c 24 28 mov 0x28(%rsp),%rcx
2562: 48 89 4b 10 mov %rcx,0x10(%rbx)
2566: 48 83 c4 58 add $0x58,%rsp
256a: 5b pop %rbx
256b: 41 5c pop %r12
256d: 41 5d pop %r13
256f: 41 5e pop %r14
2571: 41 5f pop %r15
2573: 5d pop %rbp
2574: c3 ret
2575: 41 f6 c0 07 test $0x7,%r8b
2579: 0f 85 2d 01 00 00 jne 26ac <Jf_insert_sorted+0x44c>
257f: 49 39 f0 cmp %rsi,%r8
2582: 0f 82 24 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2588: 48 8b 46 f8 mov -0x8(%rsi),%rax
258c: 48 0f ba e0 32 bt $0x32,%rax
2591: 0f 82 15 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2597: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
259d: 0f 85 70 01 00 00 jne 2713 <Jf_insert_sorted+0x4b3>
25a3: 4c 89 f9 mov %r15,%rcx
25a6: 41 bf 01 00 00 00 mov $0x1,%r15d
25ac: 48 21 d0 and %rdx,%rax
25af: 0f 85 56 ff ff ff jne 250b <Jf_insert_sorted+0x2ab>
25b5: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
25ba: 48 8d 05 2f 25 00 00 lea 0x252f(%rip),%rax # 4af0 <Jgo_.str+0xe0>
25c1: 48 89 44 24 30 mov %rax,0x30(%rsp)
25c6: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
25cb: 48 89 df mov %rbx,%rdi
25ce: 48 89 74 24 08 mov %rsi,0x8(%rsp)
25d3: 4c 89 ce mov %r9,%rsi
25d6: 4c 89 44 24 20 mov %r8,0x20(%rsp)
25db: e8 20 fb ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
25e0: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
25e5: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
25ea: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
25f1: ff 00 00
25f4: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
25f9: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
25fe: e9 08 ff ff ff jmp 250b <Jf_insert_sorted+0x2ab>
2603: 48 8d 15 c6 24 00 00 lea 0x24c6(%rip),%rdx # 4ad0 <Jgo_.str+0xc0>
260a: 4c 89 ef mov %r13,%rdi
260d: 4c 89 e6 mov %r12,%rsi
2610: e8 db fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
2615: 4c 89 f9 mov %r15,%rcx
2618: 49 83 c6 08 add $0x8,%r14
261c: 48 8d 15 4d 25 00 00 lea 0x254d(%rip),%rdx # 4b70 <Jgo_.str+0x160>
2623: 4c 89 f7 mov %r14,%rdi
2626: 48 89 ce mov %rcx,%rsi
2629: e8 c2 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
262e: 48 83 c7 08 add $0x8,%rdi
2632: 48 8d 15 67 24 00 00 lea 0x2467(%rip),%rdx # 4aa0 <Jgo_.str+0x90>
2639: e8 b2 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
263e: 48 8d 15 cb 24 00 00 lea 0x24cb(%rip),%rdx # 4b10 <Jgo_.str+0x100>
2645: e8 a6 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
264a: b0 01 mov $0x1,%al
264c: 31 d2 xor %edx,%edx
264e: e9 0a ff ff ff jmp 255d <Jf_insert_sorted+0x2fd>
2653: 49 83 c4 f0 add $0xfffffffffffffff0,%r12
2657: 48 8d 05 d2 24 00 00 lea 0x24d2(%rip),%rax # 4b30 <Jgo_.str+0x120>
265e: 48 89 44 24 30 mov %rax,0x30(%rsp)
2663: 48 89 df mov %rbx,%rdi
2666: 49 89 f6 mov %rsi,%r14
2669: 4c 89 e6 mov %r12,%rsi
266c: 4d 89 c7 mov %r8,%r15
266f: 4d 89 cc mov %r9,%r12
2672: e8 89 fa ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
2677: 4d 89 e1 mov %r12,%r9
267a: 4d 89 f8 mov %r15,%r8
267d: 4c 89 f6 mov %r14,%rsi
2680: 48 89 c2 mov %rax,%rdx
2683: 41 80 39 00 cmpb $0x0,(%r9)
2687: 0f 84 b3 fe ff ff je 2540 <Jf_insert_sorted+0x2e0>
268d: 48 89 df mov %rbx,%rdi
2690: 49 89 f6 mov %rsi,%r14
2693: 49 89 d7 mov %rdx,%r15
2696: 4d 89 c4 mov %r8,%r12
2699: e8 02 fa ff ff call 20a0 <filc_store_barrier_for_lower_slow@plt>
269e: 4d 89 e0 mov %r12,%r8
26a1: 4c 89 fa mov %r15,%rdx
26a4: 4c 89 f6 mov %r14,%rsi
26a7: e9 94 fe ff ff jmp 2540 <Jf_insert_sorted+0x2e0>
26ac: 48 8d 15 dd 24 00 00 lea 0x24dd(%rip),%rdx # 4b90 <Jgo_.str+0x180>
26b3: e8 38 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
26b8: 48 8d 15 81 24 00 00 lea 0x2481(%rip),%rdx # 4b40 <Jgo_.str+0x130>
26bf: e9 46 ff ff ff jmp 260a <Jf_insert_sorted+0x3aa>
26c4: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
26c9: 48 89 df mov %rbx,%rdi
26cc: 48 89 74 24 08 mov %rsi,0x8(%rsp)
26d1: 48 89 ce mov %rcx,%rsi
26d4: 4c 89 44 24 20 mov %r8,0x20(%rsp)
26d9: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
26de: 49 89 c7 mov %rax,%r15
26e1: e8 ba f9 ff ff call 20a0 <filc_store_barrier_for_lower_slow@plt>
26e6: 4c 8b 0d f3 28 00 00 mov 0x28f3(%rip),%r9 # 4fe0 <filc_is_marking>
26ed: 4c 89 f8 mov %r15,%rax
26f0: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
26f5: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
26fa: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
26ff: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
2706: ff 00 00
2709: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
270e: e9 11 fe ff ff jmp 2524 <Jf_insert_sorted+0x2c4>
2713: 48 8d 15 96 24 00 00 lea 0x2496(%rip),%rdx # 4bb0 <Jgo_.str+0x1a0>
271a: e9 eb fe ff ff jmp 260a <Jf_insert_sorted+0x3aa>
271f: 48 89 c7 mov %rax,%rdi
2722: 48 89 d6 mov %rdx,%rsi
2725: e8 b6 f9 ff ff call 20e0 <filc_check_function_call_fail@plt>
272a: 48 89 f0 mov %rsi,%rax
272d: 48 8d 15 34 23 00 00 lea 0x2334(%rip),%rdx # 4a68 <Jgo_.str+0x58>
2734: be 10 00 00 00 mov $0x10,%esi
2739: 48 89 c7 mov %rax,%rdi
273c: e8 ff f8 ff ff call 2040 <filc_cc_args_check_failure@plt>
2741: 48 8d 05 88 24 00 00 lea 0x2488(%rip),%rax # 4bd0 <Jgo_.str+0x1c0>
2748: be 08 00 00 00 mov $0x8,%esi
274d: 48 89 d7 mov %rdx,%rdi
2750: 48 89 c2 mov %rax,%rdx
2753: e8 78 f9 ff ff call 20d0 <filc_cc_rets_check_failure@plt>
Let's begin! First, the linker thunk:
0000000000002250 <pizlonated_insert_sorted>:
2250: 48 8d 05 09 00 00 00 lea 0x9(%rip),%rax # 2260 <Jf_insert_sorted>
2257: 48 8d 15 ca 29 00 00 lea 0x29ca(%rip),%rdx # 4c28 <Jfo_insert_sorted+0x10>
225e: c3 ret
Fil-C has memory-safe linking and puts all external symbols in the pizlonated_
namespace. Each linker symbol (whether code or data) is a function that returns a flight pointer (a tuple of the pointer's capability pointer and the pointer's value) to the global. Here, we're returning Jf_insert_sorted
- the actual implementation - as the pointer's value, and a pointer to a readonly internal global that represents the capability for that function. Jf_insert_sorted
is an internal function (the linker doesn't link calls to it).
Next, the prologue of Jf_insert_sorted
, which is the implementation of insert_sorted
:
2260: 55 push %rbp
2261: 41 57 push %r15
2263: 41 56 push %r14
2265: 41 55 push %r13
2267: 41 54 push %r12
2269: 53 push %rbx
226a: 48 83 ec 58 sub $0x58,%rsp
226e: 48 39 27 cmp %rsp,(%rdi)
2271: 0f 83 e9 fd ff ff jae 2060 <filc_stack_overflow_failure@plt>
We're saving a bunch of callee save registers using push
, then setting up some stack space by sub
ing from %rsp
(the stack pointer).
The last two instructions - the cmp
and jae
- are checking if we've overflowed the stack. Here, %rdi
is the first argument in the normal X86_64
calling convention, which Fil-C uses to pass around the thread pointer. (Issue 16: Turn the thread pointer into a proper pinned register). The full signature of a pizlonated function is:
struct pizlonated_return_value {
bool has_exception;
size_t return_size;
};
typedef pizlonated_return_value (*pizlonated_function)(
filc_thread* my_thread, size_t argument_size);
The first field in the Fil-C thread object is the stack limit. If we breach that, we jump to the special filc_stack_overflow_failure
thunk, which is engineered to work correctly even if we had blown past the end of the stack so thoroughly that %rsp
was no longer pointing at a valid page.
Next let's look at the Pizderson frame setup:
2277: 48 8b 4f 10 mov 0x10(%rdi),%rcx
227b: 48 89 4c 24 28 mov %rcx,0x28(%rsp)
2280: 48 8d 4c 24 28 lea 0x28(%rsp),%rcx
2285: 48 89 4f 10 mov %rcx,0x10(%rdi)
2289: 48 8d 0d d8 27 00 00 lea 0x27d8(%rip),%rcx # 4a68 <Jgo_.str+0x58>
2290: 48 89 4c 24 30 mov %rcx,0x30(%rsp)
2295: 0f 57 c0 xorps %xmm0,%xmm0
2298: 0f 11 44 24 38 movups %xmm0,0x38(%rsp)
229d: 0f 11 44 24 48 movups %xmm0,0x48(%rsp)
Pizderson frames allow for not-too-hilariously-expensive accurate GC with minimal compiler support and a dead-simple runtime interface (i.e. how the runtime scans the stack). Pizderson frames are like Henderson frames except they only support non-moving GC, which means that pointers in locals can still be register-allocated. (Issue 17: Implement real accurate stack scanning)
Here's the struct layout of a Fil-C Pizderson frame:
struct filc_frame {
filc_frame* parent;
const filc_origin* origin;
void* lowers[];
};
In this assembly snippet, we're first setting up parent
:
2277: 48 8b 4f 10 mov 0x10(%rdi),%rcx
227b: 48 89 4c 24 28 mov %rcx,0x28(%rsp)
The first argument register (%rdi
) holds the thread pointer, and offset 0x10
happens to be where the top frame pointer field is. We load that and store it to the parent
field of the stack-allocated filc_frame
.
Then we store the address of the Pizderson frame to the top frame pointer in the thread:
2280: 48 8d 4c 24 28 lea 0x28(%rsp),%rcx
2285: 48 89 4f 10 mov %rcx,0x10(%rdi)
And then we store the origin (compiler-generated metadata about a specific point in the code).
2289: 48 8d 0d d8 27 00 00 lea 0x27d8(%rip),%rcx # 4a68 <Jgo_.str+0x58>
2290: 48 89 4c 24 30 mov %rcx,0x30(%rsp)
Note that we'll store other origins into this field. We'll do something like this store at every point in the code where we perform a call (either to the Fil-C runtime or to another pizlonated function). The origin also contains information about this function, such as the length of the lowers
array.
Finally, we initialize the lowers
.
2295: 0f 57 c0 xorps %xmm0,%xmm0
2298: 0f 11 44 24 38 movups %xmm0,0x38(%rsp)
229d: 0f 11 44 24 48 movups %xmm0,0x48(%rsp)
In Fil-C runtime and compiler jargon, the lower
is a particular representation of the capability, where we point just above the filc_object
(i.e. the capability object), which also happens to be the lower bounds. If you see accesses at negative constant offsets as big as -16, then those are likely to be filc_object
accesses. Since this function doesn't yet have any pointers that need to be tracked by GC, this is just initialized to zero. Whenever a pointer value is produced, it'll be stored into somewhere in this lowers
array. (Issue 18: Optimize how the compiler emits stores to the filc_frame lowers array)
Next, we process insert_sorted
's arguments, i.e. struct list** list_head, int value
:
22a2: 48 83 fe 0f cmp $0xf,%rsi
22a6: 0f 86 7e 04 00 00 jbe 272a <Jf_insert_sorted+0x4ca>
22ac: 48 89 fb mov %rdi,%rbx
22af: 4c 8b af 80 00 00 00 mov 0x80(%rdi),%r13
22b6: 8b 87 88 00 00 00 mov 0x88(%rdi),%eax
22bc: 89 44 24 08 mov %eax,0x8(%rsp)
22c0: 4c 8b a7 80 01 00 00 mov 0x180(%rdi),%r12
Recall that in the Fil-C calling convention, the second argument (%rsi
) is the "argument size":
typedef pizlonated_return_value (*pizlonated_function)(
filc_thread* my_thread, size_t argument_size);
This is the number of bytes worth of arguments being passed. Arguments are passed in a thread-local buffer that starts at offset 0x80
from the thread pointer. Each primitive argument is passed using its corresponding 64-bit "canonical" representation (so an int16_t
would be passed as a int64_t
and a float
would be passed as a double
). Hence, insert_sorted
takes 16 bytes worth of arguments, and this code is branching away with a jbe
if the argument size is <= 0xf
.
Next it looks like we're copying the thread pointer into a callee-save register:
22ac: 48 89 fb mov %rdi,%rbx
And finally we're loading the arguments:
22af: 4c 8b af 80 00 00 00 mov 0x80(%rdi),%r13
22b6: 8b 87 88 00 00 00 mov 0x88(%rdi),%eax
22bc: 89 44 24 08 mov %eax,0x8(%rsp)
22c0: 4c 8b a7 80 01 00 00 mov 0x180(%rdi),%r12
Seems like one of the arguments (int value
) is immediately spilled to 0x8(%rsp)
. Notice that there are three loads total. This is because there is a second buffer for the arguments' capabilities. Since struct list** list_head
is a pointer, we load its lower
from that buffer (at offset 0x180
).
Fil-C's calling convention allows for maximally sloppy function pointer casts. If the callsite's signature disagrees with the callee's signature, then stuff just tends to work. It's not clear that this is a requirement of Fil-C, and even if it was, this thread-local buffer for arguments (and return values, as we'll see later) is hilariously inefficient. (Issue 22: The Fil-C calling convention should be optimized)
malloc
Next we get the pointer to malloc
:
22c7: 48 8d 35 02 29 00 00 lea 0x2902(%rip),%rsi # 4bd0 <Jgo_.str+0x1c0>
22ce: e8 ed fd ff ff call 20c0 <pizlonated_malloc@plt>
The pizlonated_malloc
function just returns the pointer to malloc
along with its capability. These are now in %rax
(the pointer) and %rdx
(its capability; specifically, its lower).
The signature of a pizlonated getter is:
typedef filc_ptr (*pizlonated_getter)(
filc_thread* my_thread, const filc_origin* passed_origin);
In this call to pizlonated_malloc
, we still had the thread pointer in %rdi
, so that got passed as the first argument. And we materialized an origin and passed it as the passed_origin
argument via %rsi
. This is in case the getter has to be able to print a stack trace, which could happen if we're calling an ifunc (yes, Fil-C supports ifuncs and they are memory-safe).
Now we set up the arguments to the call to malloc
. Remember, we're doing:
struct list* node = (struct list*)malloc(sizeof(struct list));
sizeof(struct list)
is 16 bytes. This is the code that passes the argument:
22d3: 48 c7 83 80 00 00 00 movq $0x10,0x80(%rbx)
22da: 10 00 00 00
22de: 48 c7 83 80 01 00 00 movq $0x0,0x180(%rbx)
22e5: 00 00 00 00
We're storing 16 as the argument's value, and zero as the argument's lower (since it's not a pointer). This ensures that if malloc
was really a void* (*)(void* whatever)
, then any accesses on whatever
would fail due to the null capability check.
Next we store the malloc
callsite's origin to our Pizderson frame:
22e9: 48 8d 0d 18 29 00 00 lea 0x2918(%rip),%rcx # 4c08 <Jgo_.str+0x1f8>
22f0: 48 89 4c 24 30 mov %rcx,0x30(%rsp)
Then we do the function call check. This checks that the function pointer we're about to call is really pointing at a function:
22f5: 48 85 d2 test %rdx,%rdx
22f8: 0f 84 21 04 00 00 je 271f <Jf_insert_sorted+0x4bf>
22fe: 48 8b 4a f8 mov -0x8(%rdx),%rcx
2302: 48 be 00 00 00 00 00 movabs $0x3c0000000000000,%rsi
2309: 00 c0 03
230c: 48 21 ce and %rcx,%rsi
230f: 48 bf 00 00 00 00 00 movabs $0x40000000000000,%rdi
2316: 00 40 00
2319: 48 39 fe cmp %rdi,%rsi
231c: 0f 85 fd 03 00 00 jne 271f <Jf_insert_sorted+0x4bf>
2322: 48 be ff ff ff ff ff movabs $0xffffffffffff,%rsi
2329: ff 00 00
232c: 48 21 f1 and %rsi,%rcx
232f: 48 39 c8 cmp %rcx,%rax
2332: 0f 85 e7 03 00 00 jne 271f <Jf_insert_sorted+0x4bf>
This is a super gross check! (Issue 19: InvisiCaps 2.0: Remove flag bits from the aux ptr, never have null aux pointers, and stop using the low bits of lowers for atomicity tricks).
Let's break it down. First we have to check that we don't have the null capability:
22f5: 48 85 d2 test %rdx,%rdx
22f8: 0f 84 21 04 00 00 je 271f <Jf_insert_sorted+0x4bf>
Then we have to load the aux pointer from the capability:
22fe: 48 8b 4a f8 mov -0x8(%rdx),%rcx
This pointer has flags in the high 16 bits. First we're extracting the flag bits of interest to us into %rsi
:
2302: 48 be 00 00 00 00 00 movabs $0x3c0000000000000,%rsi
2309: 00 c0 03
230c: 48 21 ce and %rcx,%rsi
And then we're comparing the flags to the value they would have for a function capability:
230f: 48 bf 00 00 00 00 00 movabs $0x40000000000000,%rdi
2316: 00 40 00
2319: 48 39 fe cmp %rdi,%rsi
Finally, we're masking out the actual aux pointer:
2322: 48 be ff ff ff ff ff movabs $0xffffffffffff,%rsi
2329: ff 00 00
232c: 48 21 f1 and %rsi,%rcx
And comparing it to the function pointer:
232f: 48 39 c8 cmp %rcx,%rax
2332: 0f 85 e7 03 00 00 jne 271f <Jf_insert_sorted+0x4bf>
This ensures that the function pointer really points at the function. (Issue 20: Function capabilities returned from pizlonated getters are never offset)
Finally we make the call and extract the return value:
2338: be 08 00 00 00 mov $0x8,%esi
233d: 48 89 df mov %rbx,%rdi
2340: ff d0 call *%rax
2342: a8 01 test $0x1,%al
2344: 0f 85 00 03 00 00 jne 264a <Jf_insert_sorted+0x3ea>
234a: 48 83 fa 07 cmp $0x7,%rdx
234e: 0f 86 ed 03 00 00 jbe 2741 <Jf_insert_sorted+0x4e1>
2354: 48 8b bb 80 00 00 00 mov 0x80(%rbx),%rdi
235b: 48 8b b3 80 01 00 00 mov 0x180(%rbx),%rsi
The two arguments to any pizlonated function are filc_thread* my_thread, size_t argument_size
. Here we pass 8 as the second argument (the argument size, in %esi
), since we're only passing one primitive argument. And we pass our thread pointer as the first argument (%rdi
). Then we actually call malloc
:
2340: ff d0 call *%rax
And check if it "threw an exception":
2342: a8 01 test $0x1,%al
2344: 0f 85 00 03 00 00 jne 264a <Jf_insert_sorted+0x3ea>
Pizlonated functions return the following:
struct pizlonated_return_value {
bool has_exception; /* returned in %rax */
size_t return_size; /* returned in %rdx */
};
So, our first check is that malloc
isn't "throwing an exception", which really means either:
_Unwind_RaiseException
, or_Unwind_ForcedUnwind
.All C functions support the latter, so even C code has to defend against this. (Issue 21: Implement Fil-C's unwinding in terms of native unwinding)
After that, we check that it indeed returned a value:
234a: 48 83 fa 07 cmp $0x7,%rdx
234e: 0f 86 ed 03 00 00 jbe 2741 <Jf_insert_sorted+0x4e1>
Specifically, we're branching away if the return size is <= 0x7
. And then we finish making the call by loading the returned pointer and its capability from the thread's buffer:
2354: 48 8b bb 80 00 00 00 mov 0x80(%rbx),%rdi
235b: 48 8b b3 80 01 00 00 mov 0x180(%rbx),%rsi
The last instruction stores the node
into the Pizderson frame:
2362: 48 89 74 24 38 mov %rsi,0x38(%rsp)
It's pretty incredible that it takes so much work to call malloc
. So much of this could be optimized away. But even separately from general function call optimizations, the compiler could special-case malloc
. (Issue 23: Turn malloc calls into direct calls into the GC allocator)
value
Now we're at:
node->value = value;
This is what it takes to do that store:
2367: 48 85 f6 test %rsi,%rsi
236a: 44 8b 54 24 08 mov 0x8(%rsp),%r10d
236f: 0f 84 b9 02 00 00 je 262e <Jf_insert_sorted+0x3ce>
2375: f6 46 fe 06 testb $0x6,-0x2(%rsi)
2379: 0f 85 af 02 00 00 jne 262e <Jf_insert_sorted+0x3ce>
237f: 48 8d 47 08 lea 0x8(%rdi),%rax
2383: 48 39 f0 cmp %rsi,%rax
2386: 0f 82 a2 02 00 00 jb 262e <Jf_insert_sorted+0x3ce>
238c: 48 8b 4e f0 mov -0x10(%rsi),%rcx
2390: 48 83 c1 fc add $0xfffffffffffffffc,%rcx
2394: 48 39 c8 cmp %rcx,%rax
2397: 0f 87 91 02 00 00 ja 262e <Jf_insert_sorted+0x3ce>
239d: 44 89 10 mov %r10d,(%rax)
First we are checking that node
's capability isn't null:
2367: 48 85 f6 test %rsi,%rsi
236a: 44 8b 54 24 08 mov 0x8(%rsp),%r10d
236f: 0f 84 b9 02 00 00 je 262e <Jf_insert_sorted+0x3ce>
The compiler is being clever here, and it's interleaving the check with a fill of int value
(it had previously been spilled at 0x8(%rsp)
).
Then we check that node
's capability is not readonly:
2375: f6 46 fe 06 testb $0x6,-0x2(%rsi)
2379: 0f 85 af 02 00 00 jne 262e <Jf_insert_sorted+0x3ce>
Note that if we fix Issue 23 then we won't have to do most of these checks, since we'll know that the allocator returned a valid object that isn't readonly. But anyways, next we materialize &node->value
and check that it's above the lower bound:
237f: 48 8d 47 08 lea 0x8(%rdi),%rax
2383: 48 39 f0 cmp %rsi,%rax
2386: 0f 82 a2 02 00 00 jb 262e <Jf_insert_sorted+0x3ce>
Recall that capabilities are represented by the runtime and compiler using a lower
, i.e. the lower bound, which also happens to be a pointer to the top of filc_object
. Hence why the flag check was at -0x2(%rsi)
and the lower bound is just %rsi
.
Then we check that the upper bound minus 4 (sizeof(node->value)
) is greater than or equal to &node->value
(and if it was below, then we branch away to a check fail).
Finally we store value
into node->value
:
239d: 44 89 10 mov %r10d,(%rax)
Now we're at the loop:
for (next_ptr = list_head; ; next_ptr = &(*next_ptr)->next) {
if (!*next_ptr || (*next_ptr)->value > value) {
node->next = *next_ptr;
*next_ptr = node;
return node;
}
}
But before we get into the loop body, we have the loop's pre-header (the code that executes before we start looping):
23a0: 4c 89 64 24 40 mov %r12,0x40(%rsp)
23a5: 4d 85 e4 test %r12,%r12
23a8: 0f 84 55 02 00 00 je 2603 <Jf_insert_sorted+0x3a3>
23ae: 49 89 f8 mov %rdi,%r8
23b1: 4c 8d 4e f0 lea -0x10(%rsi),%r9
23b5: 4c 8d 1d 9c 26 00 00 lea 0x269c(%rip),%r11 # 4a58 <Jgo_.str+0x48>
23bc: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
23c3: ff 00 00
23c6: 41 f6 c5 07 test $0x7,%r13b
23ca: 0f 85 33 02 00 00 jne 2603 <Jf_insert_sorted+0x3a3>
The first instruction duplicates the capability of list_head
into the Pizderson frame:
23a0: 4c 89 64 24 40 mov %r12,0x40(%rsp)
This enables the GC to track the object. Note that this is superfluous, since the Fil-C calling convention requires that arguments are tracked by the caller. (Issue 18)
Next we check that list_head
has a valid capability:
23a5: 4d 85 e4 test %r12,%r12
23a8: 0f 84 55 02 00 00 je 2603 <Jf_insert_sorted+0x3a3>
We make a copy of the node
pointer:
23ae: 49 89 f8 mov %rdi,%r8
And we save the filc_object*
(as opposed to the lower
) of the node
capability into another register:
23b1: 4c 8d 4e f0 lea -0x10(%rsi),%r9
These two instructions are almost certainly register allocation goofs on LLVM's part.
Then stash the origin we'll use for the pollcheck that's in the loop (I'll explain that later):
23b5: 4c 8d 1d 9c 26 00 00 lea 0x269c(%rip),%r11 # 4a58 <Jgo_.str+0x48>
This is totally pointless; we're taking up a perfectly good register for no good reason. (Issue 24: LLVM does a bad job of register allocation for origins)
Then we stash a mask we'll use a lot:
23bc: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
23c3: ff 00 00
We wouldn't need this mask if we removed the flags from the aux pointer in InvisiCaps. (Issue 19)
Then we check that list_head
is pointer-aligned:
23c6: 41 f6 c5 07 test $0x7,%r13b
23ca: 0f 85 33 02 00 00 jne 2603 <Jf_insert_sorted+0x3a3>
Now we're at the actual loop. Note that location 23d0
is the loop header (we'll branch back here). Note that next_ptr = list_head
has implicitly happened (we're reusing the same registers that held list_head
's pointer value and capability to be next_ptr
's pointer value and capability).
23d0: 4c 89 ed mov %r13,%rbp
23d3: 4c 29 e5 sub %r12,%rbp
23d6: 0f 82 27 02 00 00 jb 2603 <Jf_insert_sorted+0x3a3>
23dc: 4d 3b 6c 24 f0 cmp -0x10(%r12),%r13
23e1: 0f 83 1c 02 00 00 jae 2603 <Jf_insert_sorted+0x3a3>
23e7: 49 8b 44 24 f8 mov -0x8(%r12),%rax
23ec: 48 21 d0 and %rdx,%rax
23ef: 0f 84 ba 00 00 00 je 24af <Jf_insert_sorted+0x24f>
23f5: 4c 8b 3c 28 mov (%rax,%rbp,1),%r15
23f9: 41 f6 c7 01 test $0x1,%r15b
23fd: 75 67 jne 2466 <Jf_insert_sorted+0x206>
23ff: 4d 8b 75 00 mov 0x0(%r13),%r14
2403: 4c 89 7c 24 48 mov %r15,0x48(%rsp)
2408: 4d 85 f6 test %r14,%r14
240b: 0f 84 bd 00 00 00 je 24ce <Jf_insert_sorted+0x26e>
2411: 4d 85 ff test %r15,%r15
2414: 0f 84 fb 01 00 00 je 2615 <Jf_insert_sorted+0x3b5>
241a: 49 8d 46 08 lea 0x8(%r14),%rax
241e: 4c 39 f8 cmp %r15,%rax
2421: 0f 82 ee 01 00 00 jb 2615 <Jf_insert_sorted+0x3b5>
2427: 49 8b 4f f0 mov -0x10(%r15),%rcx
242b: 48 83 c1 fc add $0xfffffffffffffffc,%rcx
242f: 48 39 c8 cmp %rcx,%rax
2432: 0f 87 dd 01 00 00 ja 2615 <Jf_insert_sorted+0x3b5>
2438: 4c 89 7c 24 50 mov %r15,0x50(%rsp)
243d: 44 39 10 cmp %r10d,(%rax)
2440: 0f 8f 2f 01 00 00 jg 2575 <Jf_insert_sorted+0x315>
2446: f6 43 08 0e testb $0xe,0x8(%rbx)
244a: 75 24 jne 2470 <Jf_insert_sorted+0x210>
244c: 4c 89 7c 24 40 mov %r15,0x40(%rsp)
2451: 4d 89 f5 mov %r14,%r13
2454: 4d 89 fc mov %r15,%r12
2457: 41 f6 c5 07 test $0x7,%r13b
245b: 0f 84 6f ff ff ff je 23d0 <Jf_insert_sorted+0x170>
2461: e9 9d 01 00 00 jmp 2603 <Jf_insert_sorted+0x3a3>
The first thing the loop tries to do is to load *next_ptr
. This requires a bunch of checks, the first of which is a lower bounds check on next_ptr
:
23d0: 4c 89 ed mov %r13,%rbp
23d3: 4c 29 e5 sub %r12,%rbp
23d6: 0f 82 27 02 00 00 jb 2603 <Jf_insert_sorted+0x3a3>
Note that we're doing a sub
with a branch instead of a cmp
because we're going to need the offset from next_ptr
's lower bound to access the invisible capability for *next_ptr
. So, we'll be using %rbp
again.
Next we do an upper bounds check for next_ptr
:
23dc: 4d 3b 6c 24 f0 cmp -0x10(%r12),%r13
23e1: 0f 83 1c 02 00 00 jae 2603 <Jf_insert_sorted+0x3a3>
We do this upper bounds check by directly comparing to the upper bounds, which is enough to prove that next_ptr
points to 8 bytes, since by the time we get here we would have always done an alignment check on next_ptr
(that it's 8-byte aligned).
Then we load next_ptr
's aux pointer and check if it's not null:
23e7: 49 8b 44 24 f8 mov -0x8(%r12),%rax
23ec: 48 21 d0 and %rdx,%rax
23ef: 0f 84 ba 00 00 00 je 24af <Jf_insert_sorted+0x24f>
Checking if aux is not null wouldn't be necessary if we fixed InvisiCaps. (Issue 19) We'll need the aux pointer to load *next_ptr
's capability.
Next we load the lower
for *next_ptr
from its aux (%rax
is the base of the aux allocation and %rbp
is the offset) and check that it doesn't have the atomic box bit set:
23f5: 4c 8b 3c 28 mov (%rax,%rbp,1),%r15
23f9: 41 f6 c7 01 test $0x1,%r15b
23fd: 75 67 jne 2466 <Jf_insert_sorted+0x206>
This check wouldn't be necessary if we fixed InvisiCaps. (Issue 19)
Now we load *next_ptr
's pointer value, and stash its capability in the Pizderson frame:
23ff: 4d 8b 75 00 mov 0x0(%r13),%r14
2403: 4c 89 7c 24 48 mov %r15,0x48(%rsp)
Now we check if *next_ptr
is NULL:
2408: 4d 85 f6 test %r14,%r14
240b: 0f 84 bd 00 00 00 je 24ce <Jf_insert_sorted+0x26e>
If it's NULL, then we're exiting the loop, since from the compiler's standpoint, this code is not part of the loop (it's after the loop completes):
node->next = *next_ptr;
*next_ptr = node;
return node;
We'll get back to the machine code for this later. Inside the loop, the next thing we check (if we know that *next_ptr
is not NULL) is (*next_ptr)->value > value
. But that requires loading ->value
, which means more safety checks.
The first check we need to do to load (*next_ptr)->value
is that *next_ptr
's capability is valid:
2411: 4d 85 ff test %r15,%r15
2414: 0f 84 fb 01 00 00 je 2615 <Jf_insert_sorted+0x3b5>
Then we materialize &(*next_ptr)->value
into %rax
and check that it's above lower bound:
241a: 49 8d 46 08 lea 0x8(%r14),%rax
241e: 4c 39 f8 cmp %r15,%rax
2421: 0f 82 ee 01 00 00 jb 2615 <Jf_insert_sorted+0x3b5>
Then we check that the upper bound is at least sizeof(int)
above &(*next_ptr)->value
:
2427: 49 8b 4f f0 mov -0x10(%r15),%rcx
242b: 48 83 c1 fc add $0xfffffffffffffffc,%rcx
242f: 48 39 c8 cmp %rcx,%rax
2432: 0f 87 dd 01 00 00 ja 2615 <Jf_insert_sorted+0x3b5>
It's only at this point that we save the *next_ptr
's capability into the Pizderson frame:
2438: 4c 89 7c 24 50 mov %r15,0x50(%rsp)
This is not the optimal place to schedule this store, since it's only needed for the pollcheck. (Issue 18)
Finally we check if (*next_ptr)->value > value
, and if it is, we exit the loop:
243d: 44 39 10 cmp %r10d,(%rax)
2440: 0f 8f 2f 01 00 00 jg 2575 <Jf_insert_sorted+0x315>
Now we do the pollcheck. Fil-C's GC uses safepointing to synchronize with the mutator, which requires the mutator to periodically execute pollchecks. Loop backedges must have pollchecks. This pollcheck ended up here:
2446: f6 43 08 0e testb $0xe,0x8(%rbx)
244a: 75 24 jne 2470 <Jf_insert_sorted+0x210>
Note that there are lots of ways to optimize this check. (Issue 25: Optimize pollchecks)
Now, &(*next_ptr)->next
, which happens to be exactly equal to *next_ptr
, becomes the new next_ptr
. Here's the code that makes that happen:
244c: 4c 89 7c 24 40 mov %r15,0x40(%rsp)
2451: 4d 89 f5 mov %r14,%r13
2454: 4d 89 fc mov %r15,%r12
Note that we're again saving *next_ptr
(i.e. the new next_ptr
) into the Pizderson frame, which is dumb. (Issue 18)
Recall that at the top of the loop, next_ptr
was in %r13
(the pointer value) and %r12
(the capability).
Finally we're doing a branch back to the top of the loop, but only if next_ptr
is pointer-aligned. Otherwise, we branch away to code that handles that failure.
2457: 41 f6 c5 07 test $0x7,%r13b
245b: 0f 84 6f ff ff ff je 23d0 <Jf_insert_sorted+0x170>
2461: e9 9d 01 00 00 jmp 2603 <Jf_insert_sorted+0x3a3>
This loop has a number of branches that take us to slow paths, and those got scheduled after the loop.
The first slow path is for the case that *next_ptr
was an atomic pointer:
2466: 49 83 e7 fe and $0xfffffffffffffffe,%r15
246a: 4d 8b 7f 08 mov 0x8(%r15),%r15
246e: eb 8f jmp 23ff <Jf_insert_sorted+0x19f>
This extracts the capability from the atomic box and jumps back into the part of the loop right after the fast-case load of *next_ptr
's capability.
Next is the pollcheck slow path:
2470: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
2475: 48 89 df mov %rbx,%rdi
2478: 49 89 f4 mov %rsi,%r12
247b: 4c 89 de mov %r11,%rsi
247e: 4c 89 c5 mov %r8,%rbp
2481: 4d 89 cd mov %r9,%r13
2484: e8 97 fb ff ff call 2020 <filc_pollcheck_slow@plt>
2489: 4c 8d 1d c8 25 00 00 lea 0x25c8(%rip),%r11 # 4a58 <Jgo_.str+0x48>
2490: 44 8b 54 24 08 mov 0x8(%rsp),%r10d
2495: 4d 89 e9 mov %r13,%r9
2498: 49 89 e8 mov %rbp,%r8
249b: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
24a0: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
24a7: ff 00 00
24aa: 4c 89 e6 mov %r12,%rsi
24ad: eb 9d jmp 244c <Jf_insert_sorted+0x1ec>
The first instruction here saves %rdi
, i.e. the node
pointer, to the stack (which is weird, since it's already been replicated to %r8
):
2470: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
Then we set up the first argument to the pollcheck, which is the thread pointer:
2475: 48 89 df mov %rbx,%rdi
Then we save the lower
for node
to a callee save:
2478: 49 89 f4 mov %rsi,%r12
And we set up the second argument to the pollcheck, which is the origin:
247b: 4c 89 de mov %r11,%rsi
Finally we save %r8
and %r9
(a copy of node
and its capability) to callee saves:
247e: 4c 89 c5 mov %r8,%rbp
2481: 4d 89 cd mov %r9,%r13
Note that it's silly that this code has to do so much work to save registers. It's likely that the fact that filc_pollcheck_slow
clobbers volatile registers also prevents the register allocator from doing everything that it would like, since this save-restore logic is almost certainly due to the greedy register allocator doing live-range-splitting, and that algorithm has a limit on how much of that it will do. So, making slow paths not clobber registers would reduce code size and free up greedy's splitting budget, which could improve register allocation overall. (Issue 27: Slow path calls to the runtime should use a calling convention that preserves all (or most) registers)
Then we call the pollcheck, restore everything clobbered by it, and jump back into the loop:
2489: 4c 8d 1d c8 25 00 00 lea 0x25c8(%rip),%r11 # 4a58 <Jgo_.str+0x48>
2490: 44 8b 54 24 08 mov 0x8(%rsp),%r10d
2495: 4d 89 e9 mov %r13,%r9
2498: 49 89 e8 mov %rbp,%r8
249b: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
24a0: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
24a7: ff 00 00
24aa: 4c 89 e6 mov %r12,%rsi
24ad: eb 9d jmp 244c <Jf_insert_sorted+0x1ec>
The next slow path code is for the case that the aux ptr of next_ptr
was NULL:
24af: 4d 8b 75 00 mov 0x0(%r13),%r14
24b3: 48 c7 44 24 48 00 00 movq $0x0,0x48(%rsp)
24ba: 00 00
24bc: 31 c9 xor %ecx,%ecx
24be: 41 bf 00 00 00 00 mov $0x0,%r15d
24c4: 4d 85 f6 test %r14,%r14
24c7: 74 08 je 24d1 <Jf_insert_sorted+0x271>
24c9: e9 4a 01 00 00 jmp 2618 <Jf_insert_sorted+0x3b8>
It's a bug that this path even exists. (Issue 19) Here, we do some of the same work as when we loaded next_ptr
's aux pointer. But, we know that the lower
is NULL. So, we load the *next_ptr
's pointer value:
24af: 4d 8b 75 00 mov 0x0(%r13),%r14
Then we stash a NULL capability where we would have stashed *next_ptr
's capability:
24b3: 48 c7 44 24 48 00 00 movq $0x0,0x48(%rsp)
24ba: 00 00
This next instruction is zeroing *next_ptr
's capability pointer, which will be used later (and the code we branch to expects it in both %rcx
and %r15
):
24bc: 31 c9 xor %ecx,%ecx
24be: 41 bf 00 00 00 00 mov $0x0,%r15d
Then we check if *next_ptr
is NULL, and if so, we branch to code that is after the loop. This is the same branch we mentioned earlier, just duplicated to this part of the code. If that branch fails, we branch out to a failure path - since we know that it's impossible to load from (*next_ptr)->value
if *next_ptr
has a NULL capability.
24c7: 74 08 je 24d1 <Jf_insert_sorted+0x271>
24c9: e9 4a 01 00 00 jmp 2618 <Jf_insert_sorted+0x3b8>
This code is within the loop's scope from the standpoint of C, but it is after the loop from the standpoint of the compiler:
node->next = *next_ptr;
*next_ptr = node;
return node;
Here's the reason: compilers consider a natural loop to be all of the basic blocks backwards-reachable up to its header, from any blocks that branch to the header while being dominated by the header. This code is backwards-reachable from a return
statement, so not something that branches to the loop header, therefore it's not part of the loop.
That's likely to be the primary reason why it's scheduled after the loop by the compiler.
What makes this code path most fun to find in the generated code is that there are multiple ways to get to it (next_ptr
doesn't have an aux, *next_ptr
is NULL, or (*next_ptr)->value > value
).
24ce: 4c 89 f9 mov %r15,%rcx
24d1: 41 f6 c0 07 test $0x7,%r8b
24d5: 0f 85 63 01 00 00 jne 263e <Jf_insert_sorted+0x3de>
24db: 49 39 f0 cmp %rsi,%r8
24de: 0f 82 5a 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
24e4: 48 8b 46 f8 mov -0x8(%rsi),%rax
24e8: 48 0f ba e0 32 bt $0x32,%rax
24ed: 0f 82 4b 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
24f3: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
24f9: 0f 85 b9 01 00 00 jne 26b8 <Jf_insert_sorted+0x458>
24ff: 45 31 f6 xor %r14d,%r14d
2502: 48 21 d0 and %rdx,%rax
2505: 0f 84 aa 00 00 00 je 25b5 <Jf_insert_sorted+0x355>
250b: 48 29 f7 sub %rsi,%rdi
250e: 4c 8b 0d cb 2a 00 00 mov 0x2acb(%rip),%r9 # 4fe0 <filc_is_marking>
2515: 4d 85 ff test %r15,%r15
2518: 74 0a je 2524 <Jf_insert_sorted+0x2c4>
251a: 41 80 39 00 cmpb $0x0,(%r9)
251e: 0f 85 a0 01 00 00 jne 26c4 <Jf_insert_sorted+0x464>
2524: 48 89 0c 38 mov %rcx,(%rax,%rdi,1)
2528: 4d 89 30 mov %r14,(%r8)
252b: 49 23 54 24 f8 and -0x8(%r12),%rdx
2530: 0f 84 1d 01 00 00 je 2653 <Jf_insert_sorted+0x3f3>
2536: 41 80 39 00 cmpb $0x0,(%r9)
253a: 0f 85 4d 01 00 00 jne 268d <Jf_insert_sorted+0x42d>
2540: 48 89 34 2a mov %rsi,(%rdx,%rbp,1)
2544: 4d 89 45 00 mov %r8,0x0(%r13)
2548: 4c 89 83 80 00 00 00 mov %r8,0x80(%rbx)
254f: 48 89 b3 80 01 00 00 mov %rsi,0x180(%rbx)
2556: ba 08 00 00 00 mov $0x8,%edx
255b: 31 c0 xor %eax,%eax
255d: 48 8b 4c 24 28 mov 0x28(%rsp),%rcx
2562: 48 89 4b 10 mov %rcx,0x10(%rbx)
2566: 48 83 c4 58 add $0x58,%rsp
256a: 5b pop %rbx
256b: 41 5c pop %r12
256d: 41 5d pop %r13
256f: 41 5e pop %r14
2571: 41 5f pop %r15
2573: 5d pop %rbp
2574: c3 ret
2575: 41 f6 c0 07 test $0x7,%r8b
2579: 0f 85 2d 01 00 00 jne 26ac <Jf_insert_sorted+0x44c>
257f: 49 39 f0 cmp %rsi,%r8
2582: 0f 82 24 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2588: 48 8b 46 f8 mov -0x8(%rsi),%rax
258c: 48 0f ba e0 32 bt $0x32,%rax
2591: 0f 82 15 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2597: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
259d: 0f 85 70 01 00 00 jne 2713 <Jf_insert_sorted+0x4b3>
25a3: 4c 89 f9 mov %r15,%rcx
25a6: 41 bf 01 00 00 00 mov $0x1,%r15d
25ac: 48 21 d0 and %rdx,%rax
25af: 0f 85 56 ff ff ff jne 250b <Jf_insert_sorted+0x2ab>
25b5: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
25ba: 48 8d 05 2f 25 00 00 lea 0x252f(%rip),%rax # 4af0 <Jgo_.str+0xe0>
25c1: 48 89 44 24 30 mov %rax,0x30(%rsp)
25c6: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
25cb: 48 89 df mov %rbx,%rdi
25ce: 48 89 74 24 08 mov %rsi,0x8(%rsp)
25d3: 4c 89 ce mov %r9,%rsi
25d6: 4c 89 44 24 20 mov %r8,0x20(%rsp)
25db: e8 20 fb ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
25e0: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
25e5: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
25ea: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
25f1: ff 00 00
25f4: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
25f9: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
25fe: e9 08 ff ff ff jmp 250b <Jf_insert_sorted+0x2ab>
The first instruction, 24ce
, is where we branch to if *next_ptr
is NULL, from the normal path of the loop. The second instruction, 24d1
, is where we branch to if next_ptr
's aux ptr is NULL (and then the only possibility is that *next_ptr
is NULL). And 2575
is where we branch to if (*next_ptr)->value > value
. This path is special because in this path, we would have loaded, and checked, a capability for *next_ptr
. In the other two paths, we would have either not loaded it and inferred it to be NULL, or not checked anything about it.
Let's start with the first two entrypoints, since they're nearly identical:
24ce: 4c 89 f9 mov %r15,%rcx
24d1: 41 f6 c0 07 test $0x7,%r8b
24d5: 0f 85 63 01 00 00 jne 263e <Jf_insert_sorted+0x3de>
24db: 49 39 f0 cmp %rsi,%r8
24de: 0f 82 5a 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
24e4: 48 8b 46 f8 mov -0x8(%rsi),%rax
24e8: 48 0f ba e0 32 bt $0x32,%rax
24ed: 0f 82 4b 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
24f3: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
24f9: 0f 85 b9 01 00 00 jne 26b8 <Jf_insert_sorted+0x458>
24ff: 45 31 f6 xor %r14d,%r14d
2502: 48 21 d0 and %rdx,%rax
2505: 0f 84 aa 00 00 00 je 25b5 <Jf_insert_sorted+0x355>
This code does some of the checks needed to execute:
node->next = *next_ptr;
*next_ptr = node;
But under the assumption that *next_ptr
's pointer value is NULL. Note, this code hardly exploits this assumption. The registers are:
%rcx
has *next_ptr
's capability pointer (i.e. lower
).
%rsi
has node
's capability pointer (i.e. lower
).
%r8
has node
's pointer value.
%r12
is next_ptr
's capability.
%r15
also has *next_ptr
's capability pointer.
So, we start by checking that node
's pointer is pointer-aligned.
24d1: 41 f6 c0 07 test $0x7,%r8b
24d5: 0f 85 63 01 00 00 jne 263e <Jf_insert_sorted+0x3de>
We wouldn't have to check that if we special-cased malloc. (Issue 23)
Then we check that node
is above its lower bound.
24db: 49 39 f0 cmp %rsi,%r8
24de: 0f 82 5a 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
And we check if node
isn't free.
24e4: 48 8b 46 f8 mov -0x8(%rsi),%rax
24e8: 48 0f ba e0 32 bt $0x32,%rax
24ed: 0f 82 4b 01 00 00 jb 263e <Jf_insert_sorted+0x3de>
This is interesting - we could have instead checked the upper bound for &node->next
, but at this point we would have already checked the upper bound for &node->value
, which is higher. So all we really have to do is that the object hasn't been freed (we are obligated to observe it being freed across pollchecks), which is cheaper than checking the upper bound (it's just a bit test).
Of course, maybe this isn't actually cheaper than checking the upper bounds? (Issue 26: Emitting a is-not-free check instead of an upper bounds check is often a pessimization)
Note that now, %rax
holds the aux pointer (including its flags) for node
.
Then we check if *next_ptr
is writable, or rather that the memory pointed to by next_ptr
doesn't have the readonly bit:
24f3: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
24f9: 0f 85 b9 01 00 00 jne 26b8 <Jf_insert_sorted+0x458>
Then we set *next_ptr
's value to NULL:
24ff: 45 31 f6 xor %r14d,%r14d
This is because we know on this path that *next_ptr
must be NULL. Later code will expect *next_ptr
value to be in %r14
, NULL or not.
Finally we check if node
's aux is not NULL (ignoring the high 16 bits):
2502: 48 21 d0 and %rdx,%rax
2505: 0f 84 aa 00 00 00 je 25b5 <Jf_insert_sorted+0x355>
We'll discuss the slow path that this goes to in a little bit. The next code is a branch destination coming from code that handles the (*next_ptr)->value > value
case.
250b: 48 29 f7 sub %rsi,%rdi
250e: 4c 8b 0d cb 2a 00 00 mov 0x2acb(%rip),%r9 # 4fe0 <filc_is_marking>
2515: 4d 85 ff test %r15,%r15
2518: 74 0a je 2524 <Jf_insert_sorted+0x2c4>
251a: 41 80 39 00 cmpb $0x0,(%r9)
251e: 0f 85 a0 01 00 00 jne 26c4 <Jf_insert_sorted+0x464>
2524: 48 89 0c 38 mov %rcx,(%rax,%rdi,1)
2528: 4d 89 30 mov %r14,(%r8)
252b: 49 23 54 24 f8 and -0x8(%r12),%rdx
2530: 0f 84 1d 01 00 00 je 2653 <Jf_insert_sorted+0x3f3>
2536: 41 80 39 00 cmpb $0x0,(%r9)
253a: 0f 85 4d 01 00 00 jne 268d <Jf_insert_sorted+0x42d>
2540: 48 89 34 2a mov %rsi,(%rdx,%rbp,1)
2544: 4d 89 45 00 mov %r8,0x0(%r13)
2548: 4c 89 83 80 00 00 00 mov %r8,0x80(%rbx)
254f: 48 89 b3 80 01 00 00 mov %rsi,0x180(%rbx)
2556: ba 08 00 00 00 mov $0x8,%edx
255b: 31 c0 xor %eax,%eax
255d: 48 8b 4c 24 28 mov 0x28(%rsp),%rcx
2562: 48 89 4b 10 mov %rcx,0x10(%rbx)
2566: 48 83 c4 58 add $0x58,%rsp
256a: 5b pop %rbx
256b: 41 5c pop %r12
256d: 41 5d pop %r13
256f: 41 5e pop %r14
2571: 41 5f pop %r15
2573: 5d pop %rbp
2574: c3 ret
Let's review register state:
%rax
has the aux pointer of node
.
%rbx
is the thread pointer.
%rcx
has *next_ptr
's capability pointer (i.e. lower
).
%rdx
has the 0xffffffffffff
mask (i.e. the low 48 bits mask for aux pointers).
%rbp
is the offset of *next_ptr
- i.e. the pointer value minus the lower
.
%rdi
has node
's pointer value.
%rsi
has node
's capability pointer (i.e. lower
).
%r8
also has node
's pointer value.
%r12
is next_ptr
's capability.
%r13
is next_ptr
's pointer value.
%r14
has *next_ptr
's pointer value.
%r15
also has *next_ptr
's capability pointer, but it's only used to check if the capability is not NULL. When we come to this code from the path that handles (*next_ptr)->value > value
, %r15
will just be 1 (so that we know that the capability isn't NULL). When we actually use the capability, we'll use %rcx
.
Now let's walk through this last bit of code to see what it does. First we compute the offset of the node
pointer:
250b: 48 29 f7 sub %rsi,%rdi
We'll need this to store to node
's aux, when doing node->next = *next_ptr
. So, %rdi
is no longer node's pointer value. Luckily,
%r8
also has the original pointer value (and we'll need it).
The next bit is the store barrier for FUGC.
250e: 4c 8b 0d cb 2a 00 00 mov 0x2acb(%rip),%r9 # 4fe0 <filc_is_marking>
2515: 4d 85 ff test %r15,%r15
2518: 74 0a je 2524 <Jf_insert_sorted+0x2c4>
251a: 41 80 39 00 cmpb $0x0,(%r9)
251e: 0f 85 a0 01 00 00 jne 26c4 <Jf_insert_sorted+0x464>
This is what the barrier looks like in C:
static inline void filc_store_barrier(filc_thread* my_thread, filc_object* target)
{
if (PAS_UNLIKELY(filc_is_marking) && target)
filc_store_barrier_slow(my_thread, target);
}
Here, target
is %r15
, i.e. *next_ptr
's capability. First we check that it's not NULL (if it is, we skip the barrier), and then we check if filc_is_marking
(if it is, we go to a slow path that calls filc_store_barrier_slow
). Note we'll keep using %r9
, because we'll have another barrier, so we'll again have to check if filc_is_marking
. It's legal to cache this value up to any safepoint (like a call or pollcheck).
Then we can actually do the store for node->next = *next_ptr
:
2524: 48 89 0c 38 mov %rcx,(%rax,%rdi,1)
2528: 4d 89 30 mov %r14,(%r8)
We're storing *next_ptr
's capability into node
's aux at the offset of node->next
, and we're storing *next_ptr
's pointer value into node->next
itself.
Next we load next_ptr
's aux, mask it (%rdx
had the mask), and check if it's NULL. If it's NULL, we jump to a slow path.
252b: 49 23 54 24 f8 and -0x8(%r12),%rdx
2530: 0f 84 1d 01 00 00 je 2653 <Jf_insert_sorted+0x3f3>
Now, %rdx
is known to have a non-NULL aux pointer (with flags masked off) for next_ptr
.
Next we check if we're marking again, because if we are, we'll have to run a barrier with target
being node
. We know that node
has a non-NULL capability already.
2536: 41 80 39 00 cmpb $0x0,(%r9)
253a: 0f 85 4d 01 00 00 jne 268d <Jf_insert_sorted+0x42d>
Finally, we can do the store for *next_ptr = node
:
2540: 48 89 34 2a mov %rsi,(%rdx,%rbp,1)
2544: 4d 89 45 00 mov %r8,0x0(%r13)
Again, since this is a pointer, we store the capability into the invisible capability location in next_ptr
's aux and then we store the pointer value into *next_ptr
itself.
Then we set up the return value. This function returns node
.
2548: 4c 89 83 80 00 00 00 mov %r8,0x80(%rbx)
254f: 48 89 b3 80 01 00 00 mov %rsi,0x180(%rbx)
2556: ba 08 00 00 00 mov $0x8,%edx
255b: 31 c0 xor %eax,%eax
This returns 0x8
as the second return value (i.e. the return_size
). And it returns false
for the first return value (i.e. has_exception
).
Finally we tear down the Pizderson frame, by retrieving the current frame's parent
and storing it into the thread's top frame:
255d: 48 8b 4c 24 28 mov 0x28(%rsp),%rcx
2562: 48 89 4b 10 mov %rcx,0x10(%rbx)
And then we can actually return:
2566: 48 83 c4 58 add $0x58,%rsp
256a: 5b pop %rbx
256b: 41 5c pop %r12
256d: 41 5d pop %r13
256f: 41 5e pop %r14
2571: 41 5f pop %r15
2573: 5d pop %rbp
2574: c3 ret
But that's not all, since so far we've only covered the case where we get to this path if *next_ptr
is NULL or if next_ptr
's aux ptr is NULL. Here's the code we branch to if next_ptr
has a non-NULL aux, *next_ptr
is not NULL, and (*next_ptr)->value > value
:
2575: 41 f6 c0 07 test $0x7,%r8b
2579: 0f 85 2d 01 00 00 jne 26ac <Jf_insert_sorted+0x44c>
257f: 49 39 f0 cmp %rsi,%r8
2582: 0f 82 24 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2588: 48 8b 46 f8 mov -0x8(%rsi),%rax
258c: 48 0f ba e0 32 bt $0x32,%rax
2591: 0f 82 15 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2597: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
259d: 0f 85 70 01 00 00 jne 2713 <Jf_insert_sorted+0x4b3>
25a3: 4c 89 f9 mov %r15,%rcx
25a6: 41 bf 01 00 00 00 mov $0x1,%r15d
25ac: 48 21 d0 and %rdx,%rax
25af: 0f 85 56 ff ff ff jne 250b <Jf_insert_sorted+0x2ab>
25b5: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
25ba: 48 8d 05 2f 25 00 00 lea 0x252f(%rip),%rax # 4af0 <Jgo_.str+0xe0>
25c1: 48 89 44 24 30 mov %rax,0x30(%rsp)
25c6: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
25cb: 48 89 df mov %rbx,%rdi
25ce: 48 89 74 24 08 mov %rsi,0x8(%rsp)
25d3: 4c 89 ce mov %r9,%rsi
25d6: 4c 89 44 24 20 mov %r8,0x20(%rsp)
25db: e8 20 fb ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
25e0: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
25e5: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
25ea: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
25f1: ff 00 00
25f4: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
25f9: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
25fe: e9 08 ff ff ff jmp 250b <Jf_insert_sorted+0x2ab>
This code redoes much of the same checks that we did for the other entrypoints:
2575: 41 f6 c0 07 test $0x7,%r8b
2579: 0f 85 2d 01 00 00 jne 26ac <Jf_insert_sorted+0x44c>
257f: 49 39 f0 cmp %rsi,%r8
2582: 0f 82 24 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2588: 48 8b 46 f8 mov -0x8(%rsi),%rax
258c: 48 0f ba e0 32 bt $0x32,%rax
2591: 0f 82 15 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
2597: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
259d: 0f 85 70 01 00 00 jne 2713 <Jf_insert_sorted+0x4b3>
25a3: 4c 89 f9 mov %r15,%rcx
25a6: 41 bf 01 00 00 00 mov $0x1,%r15d
25ac: 48 21 d0 and %rdx,%rax
25af: 0f 85 56 ff ff ff jne 250b <Jf_insert_sorted+0x2ab>
First we check that node
's pointer is pointer-aligned:
2575: 41 f6 c0 07 test $0x7,%r8b
2579: 0f 85 2d 01 00 00 jne 26ac <Jf_insert_sorted+0x44c>
Then we check that node
is above its lower bound:
257f: 49 39 f0 cmp %rsi,%r8
2582: 0f 82 24 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
Then we check that node
isn't free:
2588: 48 8b 46 f8 mov -0x8(%rsi),%rax
258c: 48 0f ba e0 32 bt $0x32,%rax
2591: 0f 82 15 01 00 00 jb 26ac <Jf_insert_sorted+0x44c>
And if *next_ptr
is writable:
2597: 41 f6 44 24 fe 02 testb $0x2,-0x2(%r12)
259d: 0f 85 70 01 00 00 jne 2713 <Jf_insert_sorted+0x4b3>
Now we make sure that %rcx
has a copy of *next_ptr
's capability pointer.
25a3: 4c 89 f9 mov %r15,%rcx
This next part is goofy; it's an example of how a compiler's complexity often leads to bizarre emergent behavior.
25a6: 41 bf 01 00 00 00 mov $0x1,%r15d
Along most paths, %r15
is *next_ptr
's capability pointer, and %rcx
also has a copy of exactly the same value. We're about to branch to code that will only use %r15
to branch on whether the capability is NULL or not. We actually know already that the capability isn't NULL. So, this sets it to 1. Note that if we removed this instruction, then nothing would change, since %r15
is non-NULL already.
Then we make sure that %rax
, which contains the aux pointer for node
, is masked so that the high bits are zero (the flags are no longer there so the pointer can be dereferenced). Recall that we had stashed the mask in %rdx
already. Upon masking, we branch to 250b
- i.e. the code we analyzed previously that finishes the store and returns from insert_sorted
if the aux pointer is not NULL.
25ac: 48 21 d0 and %rdx,%rax
25af: 0f 85 56 ff ff ff jne 250b <Jf_insert_sorted+0x2ab>
If it is NULL, then we fall through to this code, which initializes the aux pointer for node
.
25b5: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
25ba: 48 8d 05 2f 25 00 00 lea 0x252f(%rip),%rax # 4af0 <Jgo_.str+0xe0>
25c1: 48 89 44 24 30 mov %rax,0x30(%rsp)
25c6: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
25cb: 48 89 df mov %rbx,%rdi
25ce: 48 89 74 24 08 mov %rsi,0x8(%rsp)
25d3: 4c 89 ce mov %r9,%rsi
25d6: 4c 89 44 24 20 mov %r8,0x20(%rsp)
25db: e8 20 fb ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
25e0: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
25e5: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
25ea: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
25f1: ff 00 00
25f4: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
25f9: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
25fe: e9 08 ff ff ff jmp 250b <Jf_insert_sorted+0x2ab>
Most of this code is saving and restoring registers, which isn't needed if we used the right calling convention for this slow path. (Issue 27)
Let's break down what's happening here. First, we save %rcx
since we'll need it after the call returns and %rcx
is a volatile register.
25b5: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
Then we set up the origin. This is necessary because filc_object_ensure_aux_ptr_outline
has safety checks that may lead to the stack being dumped.
25ba: 48 8d 05 2f 25 00 00 lea 0x252f(%rip),%rax # 4af0 <Jgo_.str+0xe0>
25c1: 48 89 44 24 30 mov %rax,0x30(%rsp)
Then next instructions save volatile registers while also setting up the arguments to filc_object_ensure_aux_ptr_outline
:
25c6: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
25cb: 48 89 df mov %rbx,%rdi
25ce: 48 89 74 24 08 mov %rsi,0x8(%rsp)
25d3: 4c 89 ce mov %r9,%rsi
25d6: 4c 89 44 24 20 mov %r8,0x20(%rsp)
The first argument (%rdi
) is the thread pointer, and the second argument (%rsi
) is node
's capability, represented specifically as its filc_object*
rather than the lower (which had previously been in %r9
).
Now we can make the call, and when it returns, we restore everything we saved and jump back to 250b
, which finishes doing the stores and returns from insert_sorted
:
25db: e8 20 fb ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
25e0: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
25e5: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
25ea: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
25f1: ff 00 00
25f4: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
25f9: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
25fe: e9 08 ff ff ff jmp 250b <Jf_insert_sorted+0x2ab>
All that's left at this point are slow paths and check fail paths.
Let's look at a slow path that we branched to in the above code. Specifically, if next_ptr
's aux was NULL:
2653: 49 83 c4 f0 add $0xfffffffffffffff0,%r12
2657: 48 8d 05 d2 24 00 00 lea 0x24d2(%rip),%rax # 4b30 <Jgo_.str+0x120>
265e: 48 89 44 24 30 mov %rax,0x30(%rsp)
2663: 48 89 df mov %rbx,%rdi
2666: 49 89 f6 mov %rsi,%r14
2669: 4c 89 e6 mov %r12,%rsi
266c: 4d 89 c7 mov %r8,%r15
266f: 4d 89 cc mov %r9,%r12
2672: e8 89 fa ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
2677: 4d 89 e1 mov %r12,%r9
267a: 4d 89 f8 mov %r15,%r8
267d: 4c 89 f6 mov %r14,%rsi
2680: 48 89 c2 mov %rax,%rdx
2683: 41 80 39 00 cmpb $0x0,(%r9)
2687: 0f 84 b3 fe ff ff je 2540 <Jf_insert_sorted+0x2e0>
268d: 48 89 df mov %rbx,%rdi
2690: 49 89 f6 mov %rsi,%r14
2693: 49 89 d7 mov %rdx,%r15
2696: 4d 89 c4 mov %r8,%r12
2699: e8 02 fa ff ff call 20a0 <filc_store_barrier_for_lower_slow@plt>
269e: 4d 89 e0 mov %r12,%r8
26a1: 4c 89 fa mov %r15,%rdx
26a4: 4c 89 f6 mov %r14,%rsi
26a7: e9 94 fe ff ff jmp 2540 <Jf_insert_sorted+0x2e0>
Again, we're calling filc_object_ensure_aux_ptr_outline
. To do that we need to first get %r12
- next_ptr
's capability - to point at the filc_object
rather than the lower, so we subtract 16 (sizeof(filc_object)
):
2653: 49 83 c4 f0 add $0xfffffffffffffff0,%r12
Then we set up the origin:
2657: 48 8d 05 d2 24 00 00 lea 0x24d2(%rip),%rax # 4b30 <Jgo_.str+0x120>
265e: 48 89 44 24 30 mov %rax,0x30(%rsp)
And finally we save registers while setting up the first argument to be the thread and the second argument to be next_ptr
's object:
2663: 48 89 df mov %rbx,%rdi
2666: 49 89 f6 mov %rsi,%r14
2669: 4c 89 e6 mov %r12,%rsi
266c: 4d 89 c7 mov %r8,%r15
266f: 4d 89 cc mov %r9,%r12
2672: e8 89 fa ff ff call 2100 <filc_object_ensure_aux_ptr_outline@plt>
Then we restore registers:
2677: 4d 89 e1 mov %r12,%r9
267a: 4d 89 f8 mov %r15,%r8
267d: 4c 89 f6 mov %r14,%rsi
2680: 48 89 c2 mov %rax,%rdx
We could branch back to the normal path here, but instead, the compiler chose to emit the barrier check immediately.
2683: 41 80 39 00 cmpb $0x0,(%r9)
2687: 0f 84 b3 fe ff ff je 2540 <Jf_insert_sorted+0x2e0>
If the barrier isn't required, this branches to 2540
, which we saw previously - that's where we actually do the store for *next_ptr = node
. If the barrier is required, then we fall through to the barrier path for that store:
268d: 48 89 df mov %rbx,%rdi
2690: 49 89 f6 mov %rsi,%r14
2693: 49 89 d7 mov %rdx,%r15
2696: 4d 89 c4 mov %r8,%r12
2699: e8 02 fa ff ff call 20a0 <filc_store_barrier_for_lower_slow@plt>
269e: 4d 89 e0 mov %r12,%r8
26a1: 4c 89 fa mov %r15,%rdx
26a4: 4c 89 f6 mov %r14,%rsi
26a7: e9 94 fe ff ff jmp 2540 <Jf_insert_sorted+0x2e0>
The signature of filc_store_barrier_for_lower_slow
is:
void filc_store_barrier_for_lower_slow(filc_thread* my_thread, void* lower);
So, we're moving the thread pointer into the first argument (%rdi
), and %rsi
already had the capability for node
.
And then there's another store barrier slow path, this time for node->next = *next_ptr
:
26c4: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
26c9: 48 89 df mov %rbx,%rdi
26cc: 48 89 74 24 08 mov %rsi,0x8(%rsp)
26d1: 48 89 ce mov %rcx,%rsi
26d4: 4c 89 44 24 20 mov %r8,0x20(%rsp)
26d9: 48 89 4c 24 18 mov %rcx,0x18(%rsp)
26de: 49 89 c7 mov %rax,%r15
26e1: e8 ba f9 ff ff call 20a0 <filc_store_barrier_for_lower_slow@plt>
26e6: 4c 8b 0d f3 28 00 00 mov 0x28f3(%rip),%r9 # 4fe0 <filc_is_marking>
26ed: 4c 89 f8 mov %r15,%rax
26f0: 48 8b 4c 24 18 mov 0x18(%rsp),%rcx
26f5: 4c 8b 44 24 20 mov 0x20(%rsp),%r8
26fa: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
26ff: 48 ba ff ff ff ff ff movabs $0xffffffffffff,%rdx
2706: ff 00 00
2709: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
270e: e9 11 fe ff ff jmp 2524 <Jf_insert_sorted+0x2c4>
Lots of the code for these slow paths would go away if we used the right calling convention for runtime calls. (Issue 27)
This path handles a total of 6 check failures, all to do with next_ptr
and list_head
:
2603: 48 8d 15 c6 24 00 00 lea 0x24c6(%rip),%rdx # 4ad0 <Jgo_.str+0xc0>
260a: 4c 89 ef mov %r13,%rdi
260d: 4c 89 e6 mov %r12,%rsi
2610: e8 db fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
All but one of the branches to this code go to 2603
, but one branch goes to 260a
(in that case, a different origin is supplied).
Here's the signature of filc_optimized_access_check_fail
:
PAS_NEVER_INLINE PAS_NO_RETURN void filc_optimized_access_check_fail(
filc_ptr ptr, const filc_optimized_access_check_origin* check_origin);
So, this is a tail call, and the first argument (passed in %rdi
and %rsi
) is the pointer that caused the failure (next_ptr
/list_head
here) while the second argument (passed in %rdx
) is a compiler-generated constant global data object that provides the runtime with the metadata necessary to describe the failure.
Next up, this code has two entrypoints handling four total failures:
2615: 4c 89 f9 mov %r15,%rcx
2618: 49 83 c6 08 add $0x8,%r14
261c: 48 8d 15 4d 25 00 00 lea 0x254d(%rip),%rdx # 4b70 <Jgo_.str+0x160>
2623: 4c 89 f7 mov %r14,%rdi
2626: 48 89 ce mov %rcx,%rsi
2629: e8 c2 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
Three branches go to 2615
and one branch goes to 2618
. All of them have to do with checking *next_ptr
.
Now we handle four different check failures to do with node
:
262e: 48 83 c7 08 add $0x8,%rdi
2632: 48 8d 15 67 24 00 00 lea 0x2467(%rip),%rdx # 4aa0 <Jgo_.str+0x90>
2639: e8 b2 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
Now we handle three other check failures for node
, this time when storing to node->next
:
263e: 48 8d 15 cb 24 00 00 lea 0x24cb(%rip),%rdx # 4b10 <Jgo_.str+0x100>
2645: e8 a6 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
It's possible that malloc
"threw an exception", in which case we just propagate it. If the exception check after the malloc
call detects this, we branch here. This sets the return value to has_exception = true
and return_size = 0
, and then jumps to the return path:
264a: b0 01 mov $0x1,%al
264c: 31 d2 xor %edx,%edx
264e: e9 0a ff ff ff jmp 255d <Jf_insert_sorted+0x2fd>
This services three different check failures, all to do with node
:
26ac: 48 8d 15 dd 24 00 00 lea 0x24dd(%rip),%rdx # 4b90 <Jgo_.str+0x180>
26b3: e8 38 fa ff ff call 20f0 <filc_optimized_access_check_fail@plt>
Note that this expects %rdi
/%rsi
to already have node
's pointer and capability.
And then this handles exactly one failure, to do with *next_ptr
:
26b8: 48 8d 15 81 24 00 00 lea 0x2481(%rip),%rdx # 4b40 <Jgo_.str+0x130>
26bf: e9 46 ff ff ff jmp 260a <Jf_insert_sorted+0x3aa>
We're just setting up the origin and jumping to the code that handles the rest of the call to filc_optimized_access_check_fail
.
The function call to malloc
might fail because malloc
wasn't a function pointer; those three check failures go here:
271f: 48 89 c7 mov %rax,%rdi
2722: 48 89 d6 mov %rdx,%rsi
2725: e8 b6 f9 ff ff call 20e0 <filc_check_function_call_fail@plt>
Then there's the case where insert_sorted
was not passed enough arguments; that one check goes here:
272a: 48 89 f0 mov %rsi,%rax
272d: 48 8d 15 34 23 00 00 lea 0x2334(%rip),%rdx # 4a68 <Jgo_.str+0x58>
2734: be 10 00 00 00 mov $0x10,%esi
2739: 48 89 c7 mov %rax,%rdi
273c: e8 ff f8 ff ff call 2040 <filc_cc_args_check_failure@plt>
And finally there's the case where malloc
didn't return a value; that one check failure goes here:
2741: 48 8d 05 88 24 00 00 lea 0x2488(%rip),%rax # 4bd0 <Jgo_.str+0x1c0>
2748: be 08 00 00 00 mov $0x8,%esi
274d: 48 89 d7 mov %rdx,%rdi
2750: 48 89 c2 mov %rax,%rdx
2753: e8 78 f9 ff ff call 20d0 <filc_cc_rets_check_failure@plt>
Fil-C can compile C in a way that makes the code memory-safe. This post shows how Fil-C is very comprehensive in the checking it emits. Analyzing that code shows lots of room for improvement. I filed the following issues by doing this exercise:
Issue 16: Turn the thread pointer into a proper pinned register
Issue 18: Optimize how the compiler emits stores to the filc_frame lowers array
Issue 20: Function capabilities returned from pizlonated getters are never offset
Issue 21: Implement Fil-C's unwinding in terms of native unwinding
Issue 23: Turn malloc calls into direct calls into the GC allocator
Issue 24: LLVM does a bad job of register allocation for origins
Issue 26: Emitting a is-not-free check instead of an upper bounds check is often a pessimization