With the following IR: ``` target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" %struct.A = type { i32, i32, i32, i32, i32, i32, i8*, i8*, i8*, i8*, i8*, i8*, i8* } define dso_local i32 @g(i32 %0, i32 %1, i32 %2, i8* %3, %struct.A* byval(%struct.A) align 8 %4) local_unnamed_addr { %6 = tail call i32 @k(i32 %0) %7 = tail call i32 @f(i32 %0, i32 %1, i32 %2, i8* %3, %struct.A* nonnull byval(%struct.A) align 8 %4) ret i32 %7 } declare i32 @k(i32) declare i32 @f(i32, i32, i32, i8*, %struct.A* byval(%struct.A) align 8) ``` the emitted assembly with `llc` is (unwind info stripped out) ``` pushq %rbp pushq %r15 pushq %r14 pushq %rbx pushq %rax movq %rcx, %r14 movl %edx, %r15d movl %esi, %ebx movl %edi, %ebp callq k movl %ebp, %edi movl %ebx, %esi movl %r15d, %edx movq %r14, %rcx addq $8, %rsp popq %rbx popq %r14 popq %r15 popq %rbp jmp f # TAILCALL ``` which seems correct and is actually a tail call. However, if `musttail` is used, i.e. 
``` target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" %struct.A = type { i32, i32, i32, i32, i32, i32, i8*, i8*, i8*, i8*, i8*, i8*, i8* } define dso_local i32 @g(i32 %0, i32 %1, i32 %2, i8* %3, %struct.A* byval(%struct.A) align 8 %4) local_unnamed_addr { %6 = tail call i32 @k(i32 %0) %7 = musttail call i32 @f(i32 %0, i32 %1, i32 %2, i8* %3, %struct.A* nonnull byval(%struct.A) align 8 %4) ret i32 %7 } declare i32 @k(i32) declare i32 @f(i32, i32, i32, i8*, %struct.A* byval(%struct.A) align 8) ``` The code produced is then, ``` pushq %rbp pushq %r15 pushq %r14 pushq %rbx pushq %rax movq %rcx, %r14 movl %edx, %r15d movl %esi, %ebx movl %edi, %ebp callq k leaq 48(%rsp), %rsi movl $10, %ecx movq %rsp, %rdi rep;movsq (%rsi), %es:(%rdi) leaq 48(%rsp), %rdi movl $10, %ecx movq %rsp, %rsi rep;movsq (%rsi), %es:(%rdi) movl %ebp, %edi movl %ebx, %esi movl %r15d, %edx movq %r14, %rcx addq $8, %rsp popq %rbx popq %r14 popq %r15 popq %rbp jmp f # TAILCALL ``` which tries to copy 80 bytes of data from the argument memory onto 48 bytes of reserved stack space. This overwrites the saved register values as well as the argument values before copying them back, causing more damage... This bug seems to be superficially similar to gcc's https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96539, though gcc merely produced inefficient code whereas the code produced here is wrong. 
----- `llc -O0` (rather than the default optimization level) produces something worse ``` subq $56, %rsp .cfi_def_cfa_offset 64 leaq 64(%rsp), %rax movl %edi, 52(%rsp) # 4-byte Spill movq %rcx, 40(%rsp) # 8-byte Spill movl %edx, 36(%rsp) # 4-byte Spill movl %esi, 32(%rsp) # 4-byte Spill movq %rax, 24(%rsp) # 8-byte Spill callq k movq %rsp, %rcx movl $10, %r8d movq %rcx, 16(%rsp) # 8-byte Spill movq %r8, %rcx movq 16(%rsp), %rdi # 8-byte Reload movq 24(%rsp), %rsi # 8-byte Reload rep;movsq (%rsi), %es:(%rdi) leaq 64(%rsp), %rdi movq %r8, %rcx movq 16(%rsp), %rsi # 8-byte Reload rep;movsq (%rsi), %es:(%rdi) movl 52(%rsp), %edi # 4-byte Reload movl 32(%rsp), %esi # 4-byte Reload movl 36(%rsp), %edx # 4-byte Reload movq 40(%rsp), %rcx # 8-byte Reload movl %eax, 12(%rsp) # 4-byte Spill addq $56, %rsp .cfi_def_cfa_offset 8 jmp f # TAILCALL ``` which seems to get the source address for the first copy wrong, in addition to the aliasing issue and the unnecessary copy itself...
For a simpler case, ``` target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" %struct.A = type { i8*, i8*, i8*, i8*, i8*, i8*, i8* } define dso_local i32 @g(%struct.A* byval(%struct.A) align 8 %0) { %2 = tail call i32 @k() %3 = tail call i32 @f(%struct.A* nonnull byval(%struct.A) align 8 %0) ret i32 %3 } declare i32 @k() declare i32 @f(%struct.A* byval(%struct.A) align 8) ``` Right after "X86 DAG->DAG Instruction Selection" it becomes, ``` Frame Objects: fi#-2: size=8, align=16, fixed, at location [SP+8] fi#-1: size=56, align=16, fixed, at location [SP+8] bb.0 (%ir-block.1): ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @k, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp %0:gr32 = COPY $eax %1:gr64 = MOV64rm %fixed-stack.1, 1, $noreg, 48, $noreg :: (load 8 from %fixed-stack.1 + 48, align 16) %2:gr64 = COPY $rsp MOV64mr %2:gr64, 1, $noreg, 48, $noreg, %1:gr64 :: (store 8) %3:gr64 = MOV64rm %fixed-stack.1, 1, $noreg, 40, $noreg :: (load 8 from %fixed-stack.1 + 40) MOV64mr %2:gr64, 1, $noreg, 40, $noreg, %3:gr64 :: (store 8) %4:gr64 = MOV64rm %fixed-stack.1, 1, $noreg, 32, $noreg :: (load 8 from %fixed-stack.1 + 32, align 16) MOV64mr %2:gr64, 1, $noreg, 32, $noreg, %4:gr64 :: (store 8) %5:gr64 = MOV64rm %fixed-stack.1, 1, $noreg, 24, $noreg :: (load 8 from %fixed-stack.1 + 24) MOV64mr %2:gr64, 1, $noreg, 24, $noreg, %5:gr64 :: (store 8) %6:gr64 = MOV64rm %fixed-stack.1, 1, $noreg, 16, $noreg :: (load 8 from %fixed-stack.1 + 16, 
align 16) MOV64mr %2:gr64, 1, $noreg, 16, $noreg, %6:gr64 :: (store 8) %7:gr64 = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load 8 from %fixed-stack.1, align 16) %8:gr64 = MOV64rm %fixed-stack.1, 1, $noreg, 8, $noreg :: (load 8 from %fixed-stack.1 + 8) MOV64mr %2:gr64, 1, $noreg, 8, $noreg, %8:gr64 :: (store 8) MOV64mr %2:gr64, 1, $noreg, 0, $noreg, %7:gr64 :: (store 8) MOV64mr %fixed-stack.0, 1, $noreg, 48, $noreg, %1:gr64 :: (store 8 into %fixed-stack.0 + 48, align 16) MOV64mr %fixed-stack.0, 1, $noreg, 40, $noreg, %3:gr64 :: (store 8 into %fixed-stack.0 + 40) MOV64mr %fixed-stack.0, 1, $noreg, 32, $noreg, %4:gr64 :: (store 8 into %fixed-stack.0 + 32, align 16) MOV64mr %fixed-stack.0, 1, $noreg, 24, $noreg, %5:gr64 :: (store 8 into %fixed-stack.0 + 24) MOV64mr %fixed-stack.0, 1, $noreg, 16, $noreg, %6:gr64 :: (store 8 into %fixed-stack.0 + 16, align 16) MOV64mr %fixed-stack.0, 1, $noreg, 8, $noreg, %8:gr64 :: (store 8 into %fixed-stack.0 + 8) MOV64mr %fixed-stack.0, 1, $noreg, 0, $noreg, %7:gr64 :: (store 8 into %fixed-stack.0, align 16) TCRETURNdi64 @f, 0, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp ``` ``` Frame Objects: fi#-2: size=8, align=16, fixed, at location [SP+8] fi#-1: size=56, align=16, fixed, at location [SP+8] bb.0 (%ir-block.1): ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @k, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def dead $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, 
implicit-def dead $ssp, implicit $rsp, implicit $ssp renamable $r8 = MOV64rm %fixed-stack.1, 1, $noreg, 48, $noreg :: (load 8 from %fixed-stack.1 + 48, align 16) MOV64mr $rsp, 1, $noreg, 48, $noreg, renamable $r8 :: (store 8) renamable $r9 = MOV64rm %fixed-stack.1, 1, $noreg, 40, $noreg :: (load 8 from %fixed-stack.1 + 40) MOV64mr $rsp, 1, $noreg, 40, $noreg, renamable $r9 :: (store 8) renamable $rdx = MOV64rm %fixed-stack.1, 1, $noreg, 32, $noreg :: (load 8 from %fixed-stack.1 + 32, align 16) MOV64mr $rsp, 1, $noreg, 32, $noreg, renamable $rdx :: (store 8) renamable $rsi = MOV64rm %fixed-stack.1, 1, $noreg, 24, $noreg :: (load 8 from %fixed-stack.1 + 24) MOV64mr $rsp, 1, $noreg, 24, $noreg, renamable $rsi :: (store 8) renamable $rdi = MOV64rm %fixed-stack.1, 1, $noreg, 16, $noreg :: (load 8 from %fixed-stack.1 + 16, align 16) MOV64mr $rsp, 1, $noreg, 16, $noreg, renamable $rdi :: (store 8) renamable $rax = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load 8 from %fixed-stack.1, align 16) renamable $rcx = MOV64rm %fixed-stack.1, 1, $noreg, 8, $noreg :: (load 8 from %fixed-stack.1 + 8) MOV64mr $rsp, 1, $noreg, 8, $noreg, renamable $rcx :: (store 8) MOV64mr $rsp, 1, $noreg, 0, $noreg, renamable $rax :: (store 8) MOV64mr %fixed-stack.0, 1, $noreg, 48, $noreg, killed renamable $r8 :: (store 8 into %fixed-stack.0 + 48, align 16) MOV64mr %fixed-stack.0, 1, $noreg, 40, $noreg, killed renamable $r9 :: (store 8 into %fixed-stack.0 + 40) MOV64mr %fixed-stack.0, 1, $noreg, 32, $noreg, killed renamable $rdx :: (store 8 into %fixed-stack.0 + 32, align 16) MOV64mr %fixed-stack.0, 1, $noreg, 24, $noreg, killed renamable $rsi :: (store 8 into %fixed-stack.0 + 24) MOV64mr %fixed-stack.0, 1, $noreg, 16, $noreg, killed renamable $rdi :: (store 8 into %fixed-stack.0 + 16, align 16) MOV64mr %fixed-stack.0, 1, $noreg, 8, $noreg, killed renamable $rcx :: (store 8 into %fixed-stack.0 + 8) MOV64mr %fixed-stack.0, 1, $noreg, 0, $noreg, killed renamable $rax :: (store 8 into 
%fixed-stack.0, align 16) TCRETURNdi64 @f, 0, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp ``` and remains largely unchanged other than register allocation until before "Prologue/Epilogue Insertion & Frame Finalization" where it changed from ``` Frame Objects: fi#-2: size=8, align=16, fixed, at location [SP+8] fi#-1: size=56, align=16, fixed, at location [SP+8] bb.0 (%ir-block.1): ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp CALL64pcrel32 @k, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def dead $eax ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp renamable $r8 = MOV64rm %fixed-stack.1, 1, $noreg, 48, $noreg :: (load 8 from %fixed-stack.1 + 48, align 16) MOV64mr $rsp, 1, $noreg, 48, $noreg, renamable $r8 :: (store 8) renamable $r9 = MOV64rm %fixed-stack.1, 1, $noreg, 40, $noreg :: (load 8 from %fixed-stack.1 + 40) MOV64mr $rsp, 1, $noreg, 40, $noreg, renamable $r9 :: (store 8) renamable $rdx = MOV64rm %fixed-stack.1, 1, $noreg, 32, $noreg :: (load 8 from %fixed-stack.1 + 32, align 16) MOV64mr $rsp, 1, $noreg, 32, $noreg, renamable $rdx :: (store 8) renamable $rsi = MOV64rm %fixed-stack.1, 1, $noreg, 24, $noreg :: (load 8 from %fixed-stack.1 + 24) MOV64mr $rsp, 1, $noreg, 24, $noreg, renamable $rsi :: (store 8) renamable $rdi = MOV64rm %fixed-stack.1, 1, $noreg, 16, $noreg :: (load 8 from %fixed-stack.1 + 16, align 16) MOV64mr $rsp, 1, $noreg, 16, $noreg, renamable $rdi :: (store 
8) renamable $rax = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load 8 from %fixed-stack.1, align 16) renamable $rcx = MOV64rm %fixed-stack.1, 1, $noreg, 8, $noreg :: (load 8 from %fixed-stack.1 + 8) MOV64mr $rsp, 1, $noreg, 8, $noreg, renamable $rcx :: (store 8) MOV64mr $rsp, 1, $noreg, 0, $noreg, renamable $rax :: (store 8) MOV64mr %fixed-stack.0, 1, $noreg, 48, $noreg, killed renamable $r8 :: (store 8 into %fixed-stack.0 + 48, align 16) MOV64mr %fixed-stack.0, 1, $noreg, 40, $noreg, killed renamable $r9 :: (store 8 into %fixed-stack.0 + 40) MOV64mr %fixed-stack.0, 1, $noreg, 32, $noreg, killed renamable $rdx :: (store 8 into %fixed-stack.0 + 32, align 16) MOV64mr %fixed-stack.0, 1, $noreg, 24, $noreg, killed renamable $rsi :: (store 8 into %fixed-stack.0 + 24) MOV64mr %fixed-stack.0, 1, $noreg, 16, $noreg, killed renamable $rdi :: (store 8 into %fixed-stack.0 + 16, align 16) MOV64mr %fixed-stack.0, 1, $noreg, 8, $noreg, killed renamable $rcx :: (store 8 into %fixed-stack.0 + 8) MOV64mr %fixed-stack.0, 1, $noreg, 0, $noreg, killed renamable $rax :: (store 8 into %fixed-stack.0, align 16) TCRETURNdi64 @f, 0, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp ``` to ``` Frame Objects: fi#-2: size=8, align=16, fixed, at location [SP+8] fi#-1: size=56, align=16, fixed, at location [SP+8] bb.0 (%ir-block.1): frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp CFI_INSTRUCTION def_cfa_offset 16 CALL64pcrel32 @k, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh $r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def dead $eax renamable $r8 = MOV64rm $rsp, 1, $noreg, 64, $noreg :: (load 8 
from %fixed-stack.1 + 48, align 16) MOV64mr $rsp, 1, $noreg, 48, $noreg, renamable $r8 :: (store 8) renamable $r9 = MOV64rm $rsp, 1, $noreg, 56, $noreg :: (load 8 from %fixed-stack.1 + 40) MOV64mr $rsp, 1, $noreg, 40, $noreg, renamable $r9 :: (store 8) renamable $rdx = MOV64rm $rsp, 1, $noreg, 48, $noreg :: (load 8 from %fixed-stack.1 + 32, align 16) MOV64mr $rsp, 1, $noreg, 32, $noreg, renamable $rdx :: (store 8) renamable $rsi = MOV64rm $rsp, 1, $noreg, 40, $noreg :: (load 8 from %fixed-stack.1 + 24) MOV64mr $rsp, 1, $noreg, 24, $noreg, renamable $rsi :: (store 8) renamable $rdi = MOV64rm $rsp, 1, $noreg, 32, $noreg :: (load 8 from %fixed-stack.1 + 16, align 16) MOV64mr $rsp, 1, $noreg, 16, $noreg, renamable $rdi :: (store 8) renamable $rax = MOV64rm $rsp, 1, $noreg, 16, $noreg :: (load 8 from %fixed-stack.1, align 16) renamable $rcx = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %fixed-stack.1 + 8) MOV64mr $rsp, 1, $noreg, 8, $noreg, renamable $rcx :: (store 8) MOV64mr $rsp, 1, $noreg, 0, $noreg, renamable $rax :: (store 8) MOV64mr $rsp, 1, $noreg, 64, $noreg, killed renamable $r8 :: (store 8 into %fixed-stack.0 + 48, align 16) MOV64mr $rsp, 1, $noreg, 56, $noreg, killed renamable $r9 :: (store 8 into %fixed-stack.0 + 40) MOV64mr $rsp, 1, $noreg, 48, $noreg, killed renamable $rdx :: (store 8 into %fixed-stack.0 + 32, align 16) MOV64mr $rsp, 1, $noreg, 40, $noreg, killed renamable $rsi :: (store 8 into %fixed-stack.0 + 24) MOV64mr $rsp, 1, $noreg, 32, $noreg, killed renamable $rdi :: (store 8 into %fixed-stack.0 + 16, align 16) MOV64mr $rsp, 1, $noreg, 24, $noreg, killed renamable $rcx :: (store 8 into %fixed-stack.0 + 8) MOV64mr $rsp, 1, $noreg, 16, $noreg, killed renamable $rax :: (store 8 into %fixed-stack.0, align 16) $rax = frame-destroy POP64r implicit-def $rsp, implicit $rsp CFI_INSTRUCTION def_cfa_offset 8 TCRETURNdi64 @f, 0, <regmask $bh $bl $bp $bph $bpl $bx $ebp $ebx $hbp $hbx $rbp $rbx $r12 $r13 $r14 $r15 $r12b $r13b $r14b $r15b $r12bh $r13bh 
$r14bh $r15bh $r12d $r13d $r14d $r15d $r12w $r13w $r14w $r15w $r12wh and 3 more...>, implicit $rsp, implicit $ssp ``` by what basically seems like replacing %fixed-stack.1 with rsp + 8. I'm not sure what other unprinted assumptions are passed around, but it basically seems like one pass decided that the argument (fi#-1) should be copied to the current stack and back, while another pass decided that the current stack doesn't need to be adjusted — so the data is simply copied back and forth over unrelated stack space, clobbering it...