Writing In Assembler x86 and aarch64 – Lab3 SP0600

Hello,

In this post, I am talking about how I am furthering my understanding of computers, so I can understand how I can properly optimize software. I am writing about learning how to write assembler code on the x86 and aarch64 platform for my software optimization class lab.

To complete this lab I performed the following tasks :

  1. Build and run the three C versions of the program for x86_64.
    Take a look at the differences in the code.
  2. Use the objdump -d command to dump (print) the object code (machine code) and disassemble it into assembler for each of the binaries. Find the section and take a look at the code. Notice the total amount of code.
  3. Review, build and run the x86_64 assembly language programs. Take a look at the code using objdump -d objectfile and compare it to the source code. Notice the absence of other code (compared to the C binary, which had a lot of extra code).
  4. Build and run the three C versions of the program for aarch64. Verify that you can disassemble the object code in the ELF binary using objdump -d objectfile and take a look at the code
  5. Review, build and run the aarch64 assembly language programs. Take a look at the code using objdump -d objectfile and compare it to the source code.
  6. Make a loop from 0 to 9, on x86 and aarch64
  7. Extend the code to loop from 00-30, printing each value as a 2-digit decimal number, on x86 and aarch64

How I used a Makefile

Since this lab required testing, reviewing, creating and running many files I decided to load everything into a Makefile.

In doing this I learned that I can call Makefiles in other folders.
The way I did that was by adding a target to the main Makefile and typing in “cd /route/to/makefile && make all”

In the attached folders you can see the Makefile I created.

Task 1

The three c programs all perform the same task of printing “Hello World!”, but they do it in 3 different ways.

Program 1: Uses printf()
Program 2: Uses write()
Program 3: Uses syscall()

Task 2

After Reviewing the output of the objdump I can see that program 1 uses the least amount of code at 8 lines but it is using printf which has the most overhead of the three functions. Program 2 using write which should have less overhead uses 12 lines of code. And finally program 3 also uses 12 lines of code and since we are using a syscall we have very little overhead.

Task 3

Yes, Since we are now compiling straight from assembler we don’t have the overhead of the c language. This cut the program down in size drastically now the how objdump file is only 11 lines of code.

Task 4

Here is the total line count the three c programs took to run on aarch64. Pretty similar results.

Program 1: 10 lines
Program 2: 12 lines
Program 3: 12 lines

Task 5

Surprisingly, the results are identical to the x86 in term of line count. The aarch64 Hello world program used 11 lines of code the same as x86.

Something interesting I noticed about the compiled code is that it transformed all the numbers to hexadecimal.

Task 6

Here is my loops 0-9 on x86 and aarch64.

/* x86 */
.text
.globl    _start

start = 0                       /* starting value for the loop index; note that this is a symbol (constant), not a variable */
max = 10                        /* loop exits when the index hits this number (loop condition is i<max) */

_start:
    mov     $start,%r15         /* loop index */
   
loop:
    /* ... body of the loop ... do something useful here ... */
    mov     $len,%rdx
    
    mov     $48,%r14
    add     %r15,%r14
    
    movb    %r14b,msg+6
    mov     $msg,%rsi

    mov     $1,%rdi
    mov     $1,%rax
    syscall 

    inc     %r15                /* increment index */
    cmp     $max,%r15           /* see if we're done */
    jne     loop                /* loop if we're not */

    mov     $0,%rdi             /* exit status */
    mov     $60,%rax            /* syscall sys_exit */
    syscall
.data 
msg: .ascii "Loop:  \n"
    len = . - msg
/* aarch64 */
.text
.globl    _start

start = 0                       /* starting value for the loop index; note that this is a symbol (constant), not a variable */
max = 10                        /* loop exits when the index hits this number (loop condition is i<max) */

_start:
    mov     x30,start           /* loop index */
    
loop:
    
    mov     x19,48
    mov     x26,max
    mov     x27,1
    adr     x28,msg

    add     x19,x30,x19

    strb     w19,[x28,6]
    ldr      x1,=msg
        
    mov     x0,1
    mov     x2,len
    mov     x8, 64
    svc     0

    add     x30,x27,x30             /* increment index */
    cmp     x26,x30                 /* see if we're done */
    b.ne    loop                   /* loop if we're not */

    mov     x8,93                   /* syscall sys_exit */
    svc     0

    .data

    msg: .ascii "Loop:      \n"
    len = . - msg

Task 7

Here is my loops 0-30 with the leading zero’s removed on x86 and aarch64.

/* x86 */
.text
.globl    _start

start = 0                       /* starting value for the loop index; note that this is a symbol (constant), not a variable */
max = 31                        /* loop exits when the index hits this number (loop condition is i<max) */

_start:
    mov     $start,%r15         /* loop index */
    
loop:
    /* ... body of the loop ... do something useful here ... */
    
   
    mov     $48,%r13
    mov     $48,%r14
    mov     $0,%rdx

    mov     %r15,%rax
    mov     $10,%r12
    div     %r12

    
    add     %rax,%r13
    add     %rdx,%r14
    
    cmp     $48,%r13                   /*Compare*/
    
    je continue     
    
    movb    %r13b,msg+6

continue:

    movb    %r14b,msg+7
    mov     $msg,%rsi /*send message to reg rsi*/
        
    mov     $1,%rdi
    mov     $1,%rax
    mov     $len,%rdx

    syscall

    inc     %r15                /* increment index */
    cmp     $max,%r15           /* see if we're done */
    jne     loop                /* loop if we're not */

    mov     $0,%rdi             /* exit status */
    mov     $60,%rax            /* syscall sys_exit */
    syscall

    .data

    msg: .ascii "Loop:   \n"
    len = . - msg
/* aarch64 */
.text
.globl    _start

start = 0                       /* starting value for the loop index; note that this is a symbol (constant), not a variable */
max = 30                        /* loop exits when the index hits this number (loop condition is i<max) */

_start:
    mov     x30,start           /* loop index */
    
loop:
    
    mov     x19,48
    mov     x20,48
    mov     x24,10  
    mov     x25,48
    mov     x26,max
    mov     x27,1
    adr     x28,msg

    
    udiv    x21,x30,x24
    msub    x22,x21,x24,x30

    add     x19,x21,x19
    add     x20,x22,x20

    cmp     x25,x19
    b.eq    continue     
    
    strb     w19,[x28,6]

continue:

    strb     w20,[x28,7]
    ldr      x1,=msg
        
    mov     x0,1
    mov     x2,len
    mov     x8, 64
    svc     0

    add     x30,x27,x30             /* increment index */
    cmp     x26,x30                 /* see if we're done */
    b.ne    loop                   /* loop if we're not */

    mov     x8,93                   /* syscall sys_exit */
    svc     0

    .data

    msg: .ascii "Loop:      \n"
    len = . - msg

Download my files

Code Review – HelloWorld.c Lab02

Hello,

In the following post, I will be looking at seven different ways that you can compile a simple hello world program in C.

I will be using the following four flags for the GCC compiler.


-g               # enable debugging information
-O0              # do not optimize
-fno-builtin     # do not use builtin function optimizations
-static          # includes the header files in the executable

After I compile using gcc, I will look at things like the filesize and the decompiled assembler to gather my results.

Okay, so test one is to compile the following program with -g -O0 -fno-builtin. This test will be the control for this experiment. We will be basing or conclusions for the other tests of this one.


$ gcc -g -O0 -fno-builtin -o test test.c

#include 
int main(){
    printf("Hello World!\n");
}


$ objdump -fsd --source test

The command “objdump” will allow me to view the compiled code. You will probably want to pipe this command into less or send to an output file.

The results of this command will be extensive, so you will want to use a search to find “<main>.”


// Total File Size 24656 bytes
#include <stdio.h>
int main(){
  401126:	55                   	push   %rbp
  401127:	48 89 e5             	mov    %rsp,%rbp
    printf("Hello World!\n");
  40112a:	bf 10 20 40 00       	mov    $0x402010,%edi
  40112f:	b8 00 00 00 00       	mov    $0x0,%eax
  401134:	e8 f7 fe ff ff       	callq  401030 <printf@plt>
  401139:	b8 00 00 00 00       	mov    $0x0,%eax
  40113e:	5d                   	pop    %rbp
  40113f:	c3                   	retq   

Alright, now we have our control results we can now start some tests.
Here is a link to an assembler quick start guide if you need.

TEST 1

Add the compiler option -static.


$ gcc -static -g -O0 -fno-builtin -o test1 test.c

// Code
#include <stdio.h>
int main(){
    printf("Hello World!\n");
}

$ objdump -fsd --source test1 >> test1.text

// Total File Size 1720896 bytes
#include 
int main(){
  401bb5:	55                   	push   %rbp
  401bb6:	48 89 e5             	mov    %rsp,%rbp
    printf("Hello World!\n");
  401bb9:	bf 10 00 48 00       	mov    $0x480010,%edi
  401bbe:	b8 00 00 00 00       	mov    $0x0,%eax
  401bc3:	e8 f8 72 00 00       	callq  408ec0 <_IO_printf>
  401bc8:	b8 00 00 00 00       	mov    $0x0,%eax
  401bcd:	5d                   	pop    %rbp
  401bce:	c3                   	retq   
  401bcf:	90                   	nop 

If we review the results the assembler, the “printf” gets called from inside the file versus linking to another file. And, if we look at the file size, it is now a lot larger. Due to the -static, it has made it, so it included the header with the assembled code.

TEST 2

Remove the compiler option -fno-builtin.


$ gcc -g -O0 -o test2 test.c

// Code
#include <stdio.h>
int main(){
    printf("Hello World!\n");
}

$ objdump -fsd --source test2 >> test2.text

// Total File Size 24648 bytes
#include <stdio.h>
int main(){
  401126:	55                   	push   %rbp
  401127:	48 89 e5             	mov    %rsp,%rbp
    printf("Hello World!\n");
  40112a:	bf 10 20 40 00       	mov    $0x402010,%edi
  40112f:	e8 fc fe ff ff       	callq  401030 <puts@plt>
  401134:	b8 00 00 00 00       	mov    $0x0,%eax
  401139:	5d                   	pop    %rbp
  40113a:	c3                   	retq   
  40113b:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)

If we review the results of the assembler, we can see that we are no longer using “printf” the compiler has replaced it with the “puts” function.

TEST 3

Remove the compiler option -g.


$ gcc -O0 -fno-builtin -o ../excabutables/test3 test.c

// Code
#include <stdio.h>
int main(){
    printf("Hello World!\n");
}

$ objdump -fsd --source test3 >> test3.text

// Total File Size 22272 bytes
  401126:	55                   	push   %rbp
  401127:	48 89 e5             	mov    %rsp,%rbp
  40112a:	bf 10 20 40 00       	mov    $0x402010,%edi
  40112f:	b8 00 00 00 00       	mov    $0x0,%eax
  401134:	e8 f7 fe ff ff       	callq  401030 <printf@plt>
  401139:	b8 00 00 00 00       	mov    $0x0,%eax
  40113e:	5d                   	pop    %rbp
  40113f:	c3                   	retq

If we review the results of the assembler, we can see that we are no longer able to see the debugger information like the header include, or code.

TEST 4

Add additional arguments to the printf() function in your program.


$ gcc -g -O0 -fno-builtin -o test4 testMultiArgs.c

// Code
#include <stdio.h>
int main(){
    printf("Hello World!!!, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d \n", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
}

$ objdump -fsd --source test4 >> test4.text


// Total File Size 24688 bytes
#include <stdio.h>
int main(){
  401126:	55                   	push   %rbp
  401127:	48 89 e5             	mov    %rsp,%rbp
    printf("Hello World!!!, %d, %d, %d, %d, %d, %d, %d, %d, %d, %d \n", 1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
  40112a:	48 83 ec 08          	sub    $0x8,%rsp
  40112e:	6a 0a                	pushq  $0xa
  401130:	6a 09                	pushq  $0x9
  401132:	6a 08                	pushq  $0x8
  401134:	6a 07                	pushq  $0x7
  401136:	6a 06                	pushq  $0x6
  401138:	41 b9 05 00 00 00    	mov    $0x5,%r9d
  40113e:	41 b8 04 00 00 00    	mov    $0x4,%r8d
  401144:	b9 03 00 00 00       	mov    $0x3,%ecx
  401149:	ba 02 00 00 00       	mov    $0x2,%edx
  40114e:	be 01 00 00 00       	mov    $0x1,%esi
  401153:	bf 10 20 40 00       	mov    $0x402010,%edi
  401158:	b8 00 00 00 00       	mov    $0x0,%eax
  40115d:	e8 ce fe ff ff       	callq  401030 <printf@plt>
  401162:	48 83 c4 30          	add    $0x30,%rsp
  401166:	b8 00 00 00 00       	mov    $0x0,%eax
  40116b:	c9                   	leaveq 
  40116c:	c3                   	retq   
  40116d:	0f 1f 00             	nopl   (%rax)

If we review the results, we can see all the steps needed to perform a “printf” with ten arguments.

TEST 5

Move the printf() call to a separate function named output()


$ gcc -g -O0 -fno-builtin -o test5 testFunctionCall.c

// Code
#include <stdio.h>
void output(){
    printf("hello World");
}
int main(){
    output();
}

$ objdump -fsd --source test5 >> test5.text


// Total File Size 24792 bytes
int main(){
  40113c:	55                   	push   %rbp
  40113d:	48 89 e5             	mov    %rsp,%rbp
    output();
  401140:	b8 00 00 00 00       	mov    $0x0,%eax
  401145:	e8 dc ff ff ff       	callq  401126 
  40114a:	b8 00 00 00 00       	mov    $0x0,%eax
  40114f:	5d                   	pop    %rbp
  401150:	c3                   	retq   
  401151:	66 2e 0f 1f 84 00 00 	nopw   %cs:0x0(%rax,%rax,1)
  401158:	00 00 00 
  40115b:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)

#include <stdio.h>
void output(){
  401126:	55                   	push   %rbp
  401127:	48 89 e5             	mov    %rsp,%rbp
    printf("hello World");
  40112a:	bf 10 20 40 00       	mov    $0x402010,%edi
  40112f:	b8 00 00 00 00       	mov    $0x0,%eax
  401134:	e8 f7 fe ff ff       	callq  401030 
}
  401139:	90                   	nop
  40113a:	5d                   	pop    %rbp
  40113b:	c3                   	retq

If we review the results, we can see that we are now using the output function for the “printf.”

TEST 6

Remove -O0 and add -O3 to the gcc options.


$ gcc -g -fno-builtin -O3 -o test6 test.c

// Code
#include <stdio.h>
int main(){
    printf("Hello World!\n");
}

$ objdump -fsd --source test6 >> test6.text

// Total File Size 24896 bytes
#include <stdio.h>
int main(){
  401040:	48 83 ec 08          	sub    $0x8,%rsp
    printf("Hello World!\n");
  401044:	bf 10 20 40 00       	mov    $0x402010,%edi
  401049:	31 c0                	xor    %eax,%eax
  40104b:	e8 e0 ff ff ff       	callq  401030 <printf@plt>
  401050:	31 c0                	xor    %eax,%eax
  401052:	48 83 c4 08          	add    $0x8,%rsp
  401056:	c3                   	retq 

If we review the results, we can see that the -O3 gcc option has swapped a bunch of operations with more efficient versions.

Download my files