18 Jan 2014.

This is my writeup of how system calls work at the assembler level. This article specifically deals with syscalls on Linux on x86_64, but some of the material is more widely applicable.

I apologize for the AT&T assembler syntax, but all the GNU tools default to it. I figured insisting on Intel syntax everywhere would have made the writeup more confusing.

Hello World

Here's a simple "hello world" program:

#include <unistd.h>

/* Print a fixed greeting on standard output through the libc write(). */
int main() {
  const char msg[] = "Hello world!\n";
  /* sizeof counts the trailing NUL, which must not reach the output. */
  const size_t len = sizeof(msg) - 1;

  write(STDOUT_FILENO, msg, len);
  return 0;
}

(source code: hello1.c)

Note that sizeof(msg) evaluates to fourteen, because the size of the array includes the string's NUL terminator; subtracting one keeps the NUL from being written.

The code above builds and runs as you'd expect:

$ gcc hello1.c
$ ./a.out
Hello world!
$

But how is write() actually implemented?

As with most syscalls, there is no write.c in glibc that implements it. Instead, the code is mechanically generated at build time:

$ cd glibc-build-dir
$ make
[...]
(echo '#define SYSCALL_NAME write'; \
   echo '#define SYSCALL_NARGS 3'; \
   echo '#define SYSCALL_SYMBOL __libc_write'; \
   echo '#define SYSCALL_CANCELLABLE 1'; \
   echo '#include <syscall-template.S>'; \
   echo 'weak_alias (__libc_write, __write)'; \
   echo 'libc_hidden_weak (__write)'; \
   echo 'weak_alias (__libc_write, write)'; \
   echo 'libc_hidden_weak (write)'; \
  ) | gcc [lots of args elided...]

It's interesting to read through syscall-template.S and its dependencies, but it's much quicker to just look at the disassembly of the resulting object file:

$ objdump -d io/write.o
[...]
0000000000000009 <__write_nocancel>:
   9: b8 01 00 00 00        mov    $0x1,%eax
   e: 0f 05                 syscall
  10: 48 3d 01 f0 ff ff     cmp    $0xfffffffffffff001,%rax
  16: 0f 83 00 00 00 00     jae    1c <__write_nocancel+0x13>
  1c: c3                    retq

Let's ignore the nocancel and error handling bits for now, and implement our own write() like this:

/*
 * Deliberately minimal write(): load the syscall number into eax and issue
 * the syscall instruction, nothing else.
 *
 * fd, buf and count are never mentioned in the body; the code depends on
 * them still sitting in the registers the caller placed them in. There is
 * also no return statement, so the caller only sees whatever is left in
 * rax after the syscall instruction.
 */
ssize_t my_write(int fd, const void *buf, size_t count) {
  asm (
      /* 1 is the syscall number for write on x86_64 */
      "movl $1, %%eax\n\t"
      "syscall"
      /* no output or input operands are declared, only a clobber */
      ::: "eax" /* tell GCC the above clobbers the eax register */
      );
}

(source code: hello2.c)

And it works!

The reason it works, in two parts: First, when main() calls my_write(), it uses the x86_64 function call convention where the first six arguments are passed in registers rdi, rsi, rdx, rcx, r8, r9.

Second, when my_write() issues the syscall instruction, the kernel is expecting the syscall number in rax, and six arguments in the registers rdi, rsi, rdx, r10, r8, r9.

So all three of the arguments to the write syscall are already in the correct registers, we just have to set rax.

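Some asides: we only set eax rather than rax because, on x86_64, writing a 32-bit register zeroes the upper half of the corresponding 64-bit register, and the 32-bit encoding is two bytes shorter: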

b8 01 00 00 00        mov    $0x1,%eax
48 c7 c0 01 00 00 00  mov    $0x1,%rax

Here's what main() looks like:

main:
.LFB1:
  .cfi_startproc
  pushq %rbp
  .cfi_def_cfa_offset 16
  .cfi_offset 6, -16
  movq  %rsp, %rbp
  .cfi_def_cfa_register 6
  movl  $13, %edx         # <--
  movl  $msg, %esi        # <--
  movl  $1, %edi          # <--
  call  my_write          # <--
  movl  $0, %eax
  popq  %rbp
  .cfi_def_cfa 7, 8
  ret
  .cfi_endproc

The optimizer

If we turn on optimization, our code no longer works, because my_write() gets inlined into main() and instead of the above, we get:

main:
.LFB1:
  .cfi_startproc
#APP
# 4 "hello2.c" 1
  movl $1, %eax
  syscall
# 0 "" 2
#NO_APP
  movl  $0, %eax
  ret
  .cfi_endproc

The optimizer has helpfully optimized out all the register shuffling between the two functions it merged, so our syscall no longer gets any args!

The way to fix this is to explicitly tell GCC what value needs to be in which register, instead of relying on the machine's function call convention. We do this by specifying "constraints" on our inline assembler block:

/*
 * write(2) issued directly with the x86_64 `syscall` instruction.
 *
 * Every register the instruction needs is stated as a constraint, so GCC
 * loads the values itself and the code keeps working when this function is
 * inlined and optimized.
 *
 *   fd    - file descriptor to write to
 *   buf   - first byte to write
 *   count - number of bytes to write
 *
 * Returns the raw value the kernel leaves in rax: the number of bytes
 * written, or a negated errno code on failure. No errno translation is
 * done here.
 *
 * Caveat: the constraints describe registers only; GCC is not told that
 * the kernel reads the memory buf points to.
 *
 * Spelled __asm__ rather than asm so it also builds under strict ISO modes
 * such as -std=c11, where the plain asm keyword is unavailable.
 */
ssize_t my_write(int fd, const void *buf, size_t count) {
  int64_t result;
  /* volatile: the syscall has effects GCC cannot see, so the statement
     must never be dropped or merged with an identical one. */
  __asm__ volatile (
      "syscall"
      : /* outputs */
        "=a" (result)     // rax = syscall return value
      : /* inputs */
        "a" (__NR_write), // rax = syscall number
        "D" (fd),         // rdi = arg1
        "S" (buf),        // rsi = arg2
        "d" (count)       // rdx = arg3
      : /* clobbers */
        "cc",             // flags
        "r11",            // overwritten by the syscall instruction
        "rcx"             // overwritten by the syscall instruction
      );
  /* Hand the kernel's answer back instead of discarding it. */
  return result;
}

(source code: hello3.c)

Now our code works again, and survives inlining and optimization!

Some things to note:

Our optimized main() looks very clean now:

main:
.LFB1:
  .cfi_startproc
  movl  $1, %edi
  movl  $13, %edx
  movl  $msg, %esi
  movl  %edi, %eax
#APP
# 7 "hello3.c" 1
  syscall
# 0 "" 2
#NO_APP
  xorl  %eax, %eax
  ret

And everything except the syscall instruction is coming from GCC's code generator instead of our inline assembler.

errno

We can use our output constraint above to correctly set errno after the write() syscall:

/*
 * write(2) issued directly with the x86_64 `syscall` instruction, with the
 * kernel's return value translated to the usual libc convention.
 *
 *   fd    - file descriptor to write to
 *   buf   - first byte to write
 *   count - number of bytes to write
 *
 * Returns the number of bytes written on success. On failure returns -1
 * and stores the error code in errno.
 *
 * Spelled __asm__ rather than asm so it also builds under strict ISO modes
 * such as -std=c11, where the plain asm keyword is unavailable.
 */
ssize_t my_write(int fd, const void *buf, size_t count) {
  /* The kernel reports failure as a value in [-MAX_ERRNO, -1]. */
  enum { MAX_ERRNO = 4095 };
  int64_t result;

  /* volatile: without it GCC may treat two calls with equal arguments as
     one computation and emit a single syscall, or hoist it out of a loop. */
  __asm__ volatile (
      "syscall"
      : /* outputs */
        "=a" (result)     // rax = syscall return value
      : /* inputs */
        "a" (__NR_write), // rax = syscall number
        "D" (fd),         // rdi = arg1
        "S" (buf),        // rsi = arg2
        "d" (count)       // rdx = arg3
      : /* clobbers */
        "cc",             // flags
        "r11",            // overwritten by the syscall instruction
        "rcx"             // overwritten by the syscall instruction
      );

  if (result < 0 && result >= -MAX_ERRNO) {
    /* Negated errno code: flip the sign and report failure. */
    errno = (int)-result;
    return -1;
  }
  return result;
}

(source code: hello4.c)

putchar

Let's implement putchar() and make it take an int argument (rather than a char), as the C89 standard specifies:

/* Write the single character c to standard output via my_write(). */
void my_putchar(int c) {
  /* my_write() takes an address, so narrow c into a one-byte buffer. */
  char byte = (char)c;
  (void)my_write(STDOUT_FILENO, &byte, sizeof byte);
}

/* The two fixed halves of the message, printed with the libc write(). */
static const char msg1[] = "This is ";
static const char msg2[] = "broken.\n";

/*
 * Print msg1, then "not " one character at a time through my_putchar(),
 * then msg2.
 */
int main() {
  static const char middle[] = "not ";

  write(STDOUT_FILENO, msg1, sizeof(msg1) - 1);
  /* sizeof - 1 skips the NUL, so exactly 'n', 'o', 't', ' ' are sent. */
  for (size_t i = 0; i < sizeof(middle) - 1; i++) {
    my_putchar(middle[i]);
  }
  write(STDOUT_FILENO, msg2, sizeof(msg2) - 1);
  return 0;
}

(source code: putchar1.c)

You can tell what the punch-line is going to be:

$ gcc -O2 putchar1.c
$ ./a.out
This is broken.
$

So what does the generated code look like?

my_putchar:
.LFB1:
        .cfi_startproc
        leaq    -1(%rsp), %rsi
        movl    $1, %edi
        movl    $1, %edx
        movl    %edi, %eax
#APP
# 7 "putchar1.c" 1
        syscall
# 0 "" 2
#NO_APP
        ret

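As always, the write is inlined. my_putchar is passed a single argument in rdi and clobbers it. The buffer it passes to the write syscall is the byte just below the stack pointer (-1(%rsp)), but it never stores the character there: GCC was not told that the asm block reads memory through buf, so it treats the store to ch as dead and removes it.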

There's a subtle change we need to make to the input constraints for write:

--- putchar1.c  2014-01-19 00:19:23.213002952 +1100
+++ putchar2.c  2014-01-19 00:24:25.697744948 +1100
@@ -12,6 +12,7 @@
         "a" (__NR_write), // rax = syscall number
         "D" (fd),         // rdi = arg1
         "S" (buf),        // rsi = arg2
+        "m" (buf),        // buf has to be a memory location
         "d" (count)       // rdx = arg3
       : /* clobbers */
         "cc",

(source code: putchar2.c)

This gives us a working putchar:

my_putchar:
.LFB1:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movl    $1, %eax
        movl    $1, %edx
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-32, %rsp
        addq    $16, %rsp
        leaq    -80(%rsp), %rsi  # rsi = pointer to variable on stack
        movb    %dil, -80(%rsp)  # write %dil (DIL in Intel syntax) to that location
        movq    %rsi, -48(%rsp)
        movl    %eax, %edi
#APP
# 7 "putchar2.c" 1
        syscall
# 0 "" 2
#NO_APP
        leave
        .cfi_def_cfa 7, 8
        ret