Trying llvm3.6rc1 with Numba, I get crashes on some vectorized code for ordered comparisons. For a loop running "fcmp oge" over an array of doubles, generated assembly looks like this (x86-64): .align 8 .LCPI0_0: .quad 4607182418800017408 .text .globl "__main__.fn.float64.float64.array(float64,_1d,_C,_nonconst)" .align 16, 0x90 .type "__main__.fn.float64.float64.array(float64,_1d,_C,_nonconst)",@function "__main__.fn.float64.float64.array(float64,_1d,_C,_nonconst)": movq 32(%rdx), %r8 testq %r8, %r8 jle .LBB0_19 movq 24(%rdx), %rcx cmplesd %xmm0, %xmm1 movabsq $.LCPI0_0, %rax andpd (%rax), %xmm1 [snip] The "andpd (%rax), %xmm1" crashes. AFAIU, that's because $.LCPI0_0 is not 16-byte aligned. A quick look with gdb seems to confirm that hypothesis: (gdb) info registers rax rax 0x7ffff7fea058 140737354047576 (gdb) x/2 0x7ffff7fea058 0x7ffff7fea058: 0x00000000 0x3ff00000
Here is the IR as optimized by LLVM: ; ModuleID = '<string>' target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; snip unrelated stuff ; Function Attrs: nounwind define i32 @"__main__.fn.float64.float64.array(float64,_1d,_C,_nonconst)"(i8** nocapture %ret, i8* nocapture readnone %env, double %arg.x, double %arg.y, { i8*, i64, i64, double*, [1 x i64], [1 x i64] }* nocapture readonly %arg.out) #0 { entry: %.8 = load { i8*, i64, i64, double*, [1 x i64], [1 x i64] }* %arg.out, align 8 %.8.fca.3.extract = extractvalue { i8*, i64, i64, double*, [1 x i64], [1 x i64] } %.8, 3 %.8.fca.4.0.extract = extractvalue { i8*, i64, i64, double*, [1 x i64], [1 x i64] } %.8, 4, 0 %.351 = icmp sgt i64 %.8.fca.4.0.extract, 0 br i1 %.351, label %for.body.lr.ph, label %for.end for.body.lr.ph: ; preds = %entry %.37 = fcmp oge double %arg.x, %arg.y %.38 = zext i1 %.37 to i32 %.39 = sitofp i32 %.38 to double %backedge.overflow = icmp eq i64 %.8.fca.4.0.extract, 0 br i1 %backedge.overflow, label %for.body.preheader, label %overflow.checked overflow.checked: ; preds = %for.body.lr.ph %n.vec = and i64 %.8.fca.4.0.extract, -4 %cmp.zero = icmp eq i64 %n.vec, 0 br i1 %cmp.zero, label %middle.block, label %vector.ph vector.ph: ; preds = %overflow.checked %broadcast.splatinsert4 = insertelement <2 x double> undef, double %.39, i32 0 %broadcast.splat5 = shufflevector <2 x double> %broadcast.splatinsert4, <2 x double> undef, <2 x i32> zeroinitializer %0 = lshr i64 %.8.fca.4.0.extract, 2 %1 = mul i64 %0, 4 %2 = add i64 %1, -4 %3 = lshr i64 %2, 2 %4 = add i64 %3, 1 %xtraiter9 = and i64 %4, 3 %lcmp.mod10 = icmp ne i64 %xtraiter9, 0 %lcmp.overflow11 = icmp eq i64 %4, 0 %lcmp.or12 = or i1 %lcmp.overflow11, %lcmp.mod10 br i1 %lcmp.or12, label %vector.body.prol.preheader, label %vector.ph.split vector.body.prol.preheader: ; preds = %vector.ph %5 = lshr i64 %.8.fca.4.0.extract, 2 %6 = mul i64 %5, 4 %7 = add i64 %6, -4 %8 = lshr i64 %7, 2 %9 = add i64 %8, 1 
%10 = trunc i64 %9 to i2 %11 = zext i2 %10 to i64 %12 = sub i64 0, %11 br label %vector.body.prol vector.body.prol: ; preds = %vector.body.prol.preheader, %vector.body.prol %lsr.iv47 = phi i64 [ %12, %vector.body.prol.preheader ], [ %lsr.iv.next48, %vector.body.prol ] %index.prol = phi i64 [ %index.next.prol, %vector.body.prol ], [ 0, %vector.body.prol.preheader ] %sunkaddr = ptrtoint double* %.8.fca.3.extract to i64 %sunkaddr49 = mul i64 %index.prol, 8 %sunkaddr50 = add i64 %sunkaddr, %sunkaddr49 %sunkaddr51 = inttoptr i64 %sunkaddr50 to <2 x double>* store <2 x double> %broadcast.splat5, <2 x double>* %sunkaddr51, align 8 %sunkaddr52 = ptrtoint double* %.8.fca.3.extract to i64 %sunkaddr53 = mul i64 %index.prol, 8 %sunkaddr54 = add i64 %sunkaddr52, %sunkaddr53 %sunkaddr55 = add i64 %sunkaddr54, 16 %sunkaddr56 = inttoptr i64 %sunkaddr55 to <2 x double>* store <2 x double> %broadcast.splat5, <2 x double>* %sunkaddr56, align 8 %index.next.prol = add i64 %index.prol, 4 %lsr.iv.next48 = add i64 %lsr.iv47, 1 %prol.iter13.cmp = icmp ne i64 %lsr.iv.next48, 0 br i1 %prol.iter13.cmp, label %vector.body.prol, label %vector.ph.split, !llvm.loop !0 vector.ph.split: ; preds = %vector.body.prol, %vector.ph %index.unr = phi i64 [ 0, %vector.ph ], [ %index.next.prol, %vector.body.prol ] %13 = icmp ult i64 %4, 4 br i1 %13, label %middle.block, label %vector.ph.split.split vector.ph.split.split: ; preds = %vector.ph.split br label %vector.body vector.body: ; preds = %vector.body, %vector.ph.split.split %index = phi i64 [ %index.unr, %vector.ph.split.split ], [ %25, %vector.body ] %sunkaddr57 = ptrtoint double* %.8.fca.3.extract to i64 %sunkaddr58 = mul i64 %index, 8 %sunkaddr59 = add i64 %sunkaddr57, %sunkaddr58 %sunkaddr60 = inttoptr i64 %sunkaddr59 to <2 x double>* store <2 x double> %broadcast.splat5, <2 x double>* %sunkaddr60, align 8 %.sum8 = or i64 %index, 2 %14 = getelementptr double* %.8.fca.3.extract, i64 %.sum8 %15 = bitcast double* %14 to <2 x double>* store <2 x double> 
%broadcast.splat5, <2 x double>* %15, align 8 %sunkaddr61 = ptrtoint double* %.8.fca.3.extract to i64 %sunkaddr62 = mul i64 %index, 8 %sunkaddr63 = add i64 %sunkaddr61, %sunkaddr62 %sunkaddr64 = add i64 %sunkaddr63, 32 %sunkaddr65 = inttoptr i64 %sunkaddr64 to <2 x double>* store <2 x double> %broadcast.splat5, <2 x double>* %sunkaddr65, align 8 %16 = add i64 %index, 4 %.sum8.1 = or i64 %16, 2 %17 = getelementptr double* %.8.fca.3.extract, i64 %.sum8.1 %18 = bitcast double* %17 to <2 x double>* store <2 x double> %broadcast.splat5, <2 x double>* %18, align 8 %sunkaddr66 = ptrtoint double* %.8.fca.3.extract to i64 %sunkaddr67 = mul i64 %index, 8 %sunkaddr68 = add i64 %sunkaddr66, %sunkaddr67 %sunkaddr69 = add i64 %sunkaddr68, 64 %sunkaddr70 = inttoptr i64 %sunkaddr69 to <2 x double>* store <2 x double> %broadcast.splat5, <2 x double>* %sunkaddr70, align 8 %19 = add i64 %16, 4 %.sum8.2 = or i64 %19, 2 %20 = getelementptr double* %.8.fca.3.extract, i64 %.sum8.2 %21 = bitcast double* %20 to <2 x double>* store <2 x double> %broadcast.splat5, <2 x double>* %21, align 8 %sunkaddr71 = ptrtoint double* %.8.fca.3.extract to i64 %sunkaddr72 = mul i64 %index, 8 %sunkaddr73 = add i64 %sunkaddr71, %sunkaddr72 %sunkaddr74 = add i64 %sunkaddr73, 96 %sunkaddr75 = inttoptr i64 %sunkaddr74 to <2 x double>* store <2 x double> %broadcast.splat5, <2 x double>* %sunkaddr75, align 8 %22 = add i64 %19, 4 %.sum8.3 = or i64 %22, 2 %23 = getelementptr double* %.8.fca.3.extract, i64 %.sum8.3 %24 = bitcast double* %23 to <2 x double>* store <2 x double> %broadcast.splat5, <2 x double>* %24, align 8 %25 = add i64 %22, 4 %26 = icmp eq i64 %25, %n.vec br i1 %26, label %middle.block, label %vector.body, !llvm.loop !2 middle.block: ; preds = %vector.ph.split, %vector.body, %overflow.checked %resume.val = phi i64 [ 0, %overflow.checked ], [ %n.vec, %vector.body ], [ %n.vec, %vector.ph.split ] %cmp.n = icmp eq i64 %.8.fca.4.0.extract, %resume.val br i1 %cmp.n, label %for.end, label 
%for.body.preheader for.body.preheader: ; preds = %middle.block, %for.body.lr.ph %loop.index2.ph = phi i64 [ 0, %for.body.lr.ph ], [ %resume.val, %middle.block ] %27 = sub i64 %.8.fca.4.0.extract, %loop.index2.ph %xtraiter = and i64 %27, 7 %lcmp.mod = icmp ne i64 %xtraiter, 0 %lcmp.overflow = icmp eq i64 %27, 0 %lcmp.or = or i1 %lcmp.overflow, %lcmp.mod br i1 %lcmp.or, label %for.body.prol.preheader, label %for.body.preheader.split for.body.prol.preheader: ; preds = %for.body.preheader %28 = add i64 %.8.fca.4.0.extract, -1 %29 = sub i64 %.8.fca.4.0.extract, %loop.index2.ph %30 = trunc i64 %29 to i3 %31 = zext i3 %30 to i64 %32 = sub i64 0, %31 br label %for.body.prol for.body.prol: ; preds = %for.body.prol.preheader, %for.body.prol %lsr.iv24 = phi i64 [ %32, %for.body.prol.preheader ], [ %lsr.iv.next25, %for.body.prol ] %loop.index2.prol = phi i64 [ %.44.prol, %for.body.prol ], [ %loop.index2.ph, %for.body.prol.preheader ] %scevgep23 = getelementptr double* %.8.fca.3.extract, i64 %loop.index2.prol store double %.39, double* %scevgep23, align 8 %.44.prol = add nuw nsw i64 %loop.index2.prol, 1 %lsr.iv.next25 = add i64 %lsr.iv24, 1 %prol.iter.cmp = icmp ne i64 %lsr.iv.next25, 0 br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.preheader.split, !llvm.loop !5 for.body.preheader.split: ; preds = %for.body.prol, %for.body.preheader %loop.index2.unr = phi i64 [ %loop.index2.ph, %for.body.preheader ], [ %.44.prol, %for.body.prol ] %33 = icmp ult i64 %27, 8 br i1 %33, label %for.end, label %for.body.preheader.split.split for.body.preheader.split.split: ; preds = %for.body.preheader.split %34 = sub i64 %.8.fca.4.0.extract, %loop.index2.unr %35 = add i64 %loop.index2.unr, 7 %scevgep = getelementptr double* %.8.fca.3.extract, i64 %35 br label %for.body for.body: ; preds = %for.body, %for.body.preheader.split.split %lsr.iv14 = phi double* [ %scevgep15, %for.body ], [ %scevgep, %for.body.preheader.split.split ] %lsr.iv = phi i64 [ %lsr.iv.next, %for.body ], [ %34, 
%for.body.preheader.split.split ] %scevgep22 = getelementptr double* %lsr.iv14, i64 -7 store double %.39, double* %scevgep22, align 8 %scevgep21 = getelementptr double* %lsr.iv14, i64 -6 store double %.39, double* %scevgep21, align 8 %scevgep20 = getelementptr double* %lsr.iv14, i64 -5 store double %.39, double* %scevgep20, align 8 %scevgep19 = getelementptr double* %lsr.iv14, i64 -4 store double %.39, double* %scevgep19, align 8 %scevgep18 = getelementptr double* %lsr.iv14, i64 -3 store double %.39, double* %scevgep18, align 8 %scevgep17 = getelementptr double* %lsr.iv14, i64 -2 store double %.39, double* %scevgep17, align 8 %scevgep16 = getelementptr double* %lsr.iv14, i64 -1 store double %.39, double* %scevgep16, align 8 store double %.39, double* %lsr.iv14, align 8 %lsr.iv.next = add i64 %lsr.iv, -8 %scevgep15 = getelementptr double* %lsr.iv14, i64 8 %exitcond.7 = icmp eq i64 %lsr.iv.next, 0 br i1 %exitcond.7, label %for.end, label %for.body, !llvm.loop !6 for.end: ; preds = %for.body.preheader.split, %for.body, %middle.block, %entry store i8* null, i8** %ret, align 8 ret i32 0 }
(same thing happens with "fcmp uno", by the way)
By the way, I was slightly wrong about my description of what the compiled function is supposed to do. It actually compares two scalars and "broadcasts" (copies) the result into a result array. That might seem silly but that's the kind of semantics we have to implement to support Numpy universal functions.
I'm not able to get the load folded into the 'andpd' using llc built from r227320 and the provided IR example: movsd LCPI0_0(%rip), %xmm0 <--- unaligned load is ok andpd %xmm1, %xmm0 Can you see if this reproduces with trunk or send the exact build instructions you're using for that IR?
Created attachment 13790 [details] IR reproducing issue
Ok, I attached the IR to reproduce the issue. Download it and feed it to: $ llc -O3 -code-model=default -relocation-model=default -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -mattr=-avx Note that "jitdefault" isn't recognized by llc, so the reproducer uses code model "default", which produces slightly different code, but still problematic: .align 8 .LCPI0_0: .quad 4607182418800017408 # double 1 [...] andpd .LCPI0_0(%rip), %xmm1 Removing "-mattr=-avx" produces yet another problematic code sequence: vandpd .LCPI0_0(%rip), %xmm0, %xmm0 (this is still with 3.6rc1, not trunk)
(In reply to comment #6) > Removing "-mattr=-avx" produces yet another problematic code: > > vandpd .LCPI0_0(%rip), %xmm0, %xmm0 Thanks - the bug reproduces on trunk. There seems to be a hole in the codegen when you specify an AVX-capable CPU but then explicitly turn off AVX codegen. Unless I'm misunderstanding, the above AVX example will *not* generate an alignment exception. From the Intel ISA manual: "VEX-encoded instruction with 32-byte or 16-byte load semantics will support unaligned load operation by default." There is still a bug, however, because we're accessing 16-bytes of memory with that instruction, but the constant is only 8-bytes.
Minimized test case: define double @foo(double %x, double %y) { %cmp = fcmp oge double %x, %y %zext = zext i1 %cmp to i32 %conv = sitofp i32 %zext to double ret double %conv } $ ./llc 22371.ll -mattr=avx -o - ... LCPI0_0: .quad 4607182418800017408 ## double 1 ... vcmplesd %xmm0, %xmm1, %xmm0 vandpd LCPI0_0(%rip), %xmm0, %xmm0 <--- loading unknown memory retq
(In reply to comment #8) > $ ./llc 22371.ll -mattr=avx -o - I screwed that copy/paste up. The bug doesn't reproduce with just the AVX attr: $ ./llc 22371.ll -mattr=avx -o - ... vcmplesd %xmm0, %xmm1, %xmm0 vmovsd LCPI0_0(%rip), %xmm1 vandpd %xmm1, %xmm0, %xmm0 retq We apparently need to specify an AVX-capable CPU to trigger the load folding bug. For example: $ ./llc 22371.ll -mcpu=sandybridge -o - ... vcmplesd %xmm0, %xmm1, %xmm0 vandpd LCPI0_0(%rip), %xmm0, %xmm0 retq
Wow - there are bugs on top of bugs in the x86 instruction defs. This is one of those times when I wonder how LLVM could ever generate working code. :) 1. I think the simplest one is in the defs for "sse12_fp_alias_pack_logical"; these are using 32/64-bit memops instead of the required 128-bit memops. My hope is that fixing these will solve the codegen bug when AVX is enabled. 2. When you specify some AVX CPUs but no AVX codegen, you get a different bug: unaligned vector loads can be generated for SSE (non-VEX) instructions. This looks like a pervasive problem. Maybe it can be fixed in X86InstrFragmentsSIMD.td: def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return Subtarget->hasVectorUAMem() || cast<LoadSDNode>(N)->getAlignment() >= 16; }]>; ...but I'm not sure yet. 3. The feature flag above was added here: http://llvm.org/viewvc/llvm-project?view=revision&revision=224330 That commit only changed the Intel chips; AMD was left out in the cold. Ironically, this bug saves anyone who's generating code specifically for AMD from suffering bug #2, but it means that loads may not be getting folded optimally for some chunk of the ISA depending on the mess of instruction defs.
(btw, in case you're wondering, the reason we're disabling AVX is that we noticed it could produce slower code on Sandy Bridge and Ivy Bridge; and the reason Sandy Bridge is selected here is simply that it's my host CPU :-))
(In reply to comment #11) > (btw, in case you're wondering, the reason we're disabling AVX is that we > noticed it could produce slower code on Sandy Bridge and Ivy Bridge I would be very interested in investigating that problem if you can provide some kind of test case that I can run. In theory, AVX codegen should always be greater than or equal to SSE in perf.
I'll try to. If I do it, where should I post that? It's a bit off-topic for this issue.
Please open a new bug report for that one; you can cc or email me to make sure I don't miss it. Thanks!
I checked in a change to effectively revert r224330 here: http://reviews.llvm.org/rL227983
The defs for the FP logical ops were just updated here: http://reviews.llvm.org/rL228123 Sadly, this doesn't fix the load folding bug. But the good news is that after r227983, I haven't found a way to expose the bug without using the sse-unaligned-mem attribute: $ ./llc 22371.ll -mattr=sse-unaligned-mem -o - ... LCPI0_0: .quad 4607182418800017408 ## double 1 ... cmplesd %xmm0, %xmm1 andpd LCPI0_0(%rip), %xmm1 <--- load 128-bits from a 64-bit location movapd %xmm1, %xmm0 retq
Thanks very much for the changes. Will they land in the release_36 branch? I can't find them there (on the github mirror).
Yes - the 3.6 branch was just updated by Hans (thanks!) in r228323.
Great, I can confirm there are no crashes anymore in the Numba test suite (under Linux anyway).
Patch to not fold the wrong-sized load: http://reviews.llvm.org/D7474
Sanjay, should this still be considered a 3.6 blocker, or was r227983 enough to consider it unblocked? Or should I wait for http://reviews.llvm.org/D7474, and maybe more?
(In reply to comment #21) > Sanjay, should this still be considered a 3.6 blocker, or was r227983 enough > to consider it unblocked? > > Or should I wait for http://reviews.llvm.org/D7474, and maybe more? I think we've bypassed the serious danger; the part addressed by D7474 has existed for a long time, so I wouldn't consider this bug a blocker any more.
(In reply to comment #22) > (In reply to comment #21) > > Sanjay, should this still be considered a 3.6 blocker, or was r227983 enough > > to consider it unblocked? > > > > Or should I wait for http://reviews.llvm.org/D7474, and maybe more? > > I think we've bypassed the serious danger; the part addressed by D7474 has > existed for a long time, so I wouldn't consider this bug a blocker any more. OK, thanks.
Patch from D7474 is checked in with: http://reviews.llvm.org/rL229531 Along with a couple of patches to the test file because I angered some buildbots: http://reviews.llvm.org/rL229535 http://reviews.llvm.org/rL229563 Unfortunately, this doesn't completely solve the problem of folding a wrong-sized load into an FP logical instruction. But the good news is that I think it's difficult to impossible to currently generate code that could misbehave from this particular corner of the x86 backend.
Could this issue be the cause of this wine crash: "https://bugs.winehq.org/show_bug.cgi?id=38826" ? I compiled with both clang-3.6 and clang-3.7 on OSX Yosemite (installed using macports). The function that crashes looks like this with clang-3.6: (lldb) dis -n MSVCRT_fabs math.o`MSVCRT_fabs: math.o[0xcf6] <+0>: pushl %ebp math.o[0xcf7] <+1>: movl %esp, %ebp math.o[0xcf9] <+3>: andl $-0x10, %esp math.o[0xcfc] <+6>: subl $0x10, %esp math.o[0xcff] <+9>: calll 0xd04 ; <+14> at math.c:856 math.o[0xd04] <+14>: popl %eax math.o[0xd05] <+15>: movsd 0x8(%ebp), %xmm0 math.o[0xd0a] <+20>: andpd 0xad0c(%eax), %xmm0 math.o[0xd12] <+28>: movsd %xmm0, (%esp) math.o[0xd17] <+33>: fldl (%esp) math.o[0xd1a] <+36>: movl %ebp, %esp math.o[0xd1c] <+38>: popl %ebp math.o[0xd1d] <+39>: retl ... and like that with clang-3.7: (lldb) dis -n MSVCRT_fabs math.o`MSVCRT_fabs: math.o[0xd0e] <+0>: pushl %ebp math.o[0xd0f] <+1>: movl %esp, %ebp math.o[0xd11] <+3>: andl $-0x10, %esp math.o[0xd14] <+6>: subl $0x10, %esp math.o[0xd17] <+9>: calll 0xd1c ; <+14> at math.c:856 math.o[0xd1c] <+14>: popl %eax math.o[0xd1d] <+15>: movsd 0x29b4(%eax), %xmm0 math.o[0xd25] <+23>: andpd 0x8(%ebp), %xmm0 math.o[0xd2a] <+28>: movsd %xmm0, (%esp) math.o[0xd2f] <+33>: fldl (%esp) math.o[0xd32] <+36>: movl %ebp, %esp math.o[0xd34] <+38>: popl %ebp math.o[0xd35] <+39>: retl The source file is "dlls/msvcrt/math.c" in wine's source tree. Please let me know if you need more information. Thanks!
(In reply to comment #25) > Could this issue be the cause of this wine crash: > "https://bugs.winehq.org/show_bug.cgi?id=38826" ? It certainly looks suspicious. Could you please provide a reduced test case (in C or LLVM IR) and the compiler flags that you're building with? Thanks!
(In reply to comment #26) > It certainly looks suspicious. Could you please provide a reduced test case > (in C or LLVM IR) and the compiler flags that you're building with? Thanks! Never mind; I see that the trigger is compiling for 32-bit (i386): $ cat fabs.ll target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" target triple = "i386-apple-macosx10.10.0" define double @bad_fabs(double %x) { %call = tail call double @fabs(double %x) #0 ret double %call } declare double @fabs(double) #0 attributes #0 = { nounwind readnone } $ ./llc -mattr=sse2 fabs.ll -o - ... _bad_fabs: ## @bad_fabs .cfi_startproc ## BB#0: subl $12, %esp Ltmp0: .cfi_def_cfa_offset 16 movsd LCPI0_0, %xmm0 andpd 16(%esp), %xmm0 <--- illegal access of 16-bytes movsd %xmm0, (%esp) fldl (%esp) addl $12, %esp retl
It would be nice if this could get fixed for 3.7.
If we want to fix this immediately, we could just remove the wrong fold table entries where I left a FIXME from the previous change: Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp (revision 242925) +++ lib/Target/X86/X86InstrInfo.cpp (working copy) @@ -960,14 +960,14 @@ // instructions because the vector instructions require vector-sized // loads. Lowering should create vector-sized instructions (the Fv* // variants below) to allow load folding. - { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 }, - { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 }, - { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 }, - { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 }, - { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 }, - { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 }, - { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 }, - { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 }, +// { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 }, +// { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 }, +// { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 }, +// { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 }, +// { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 }, +// { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 }, +// { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 }, +// { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 }, ...but this will break 4 existing regression tests that are expecting load folding codegen for these types of instructions: LLVM :: CodeGen/X86/2008-02-06-LoadFoldingBug.ll LLVM :: CodeGen/X86/copysign-constant-magnitude.ll LLVM :: CodeGen/X86/pr2656.ll LLVM :: CodeGen/X86/sse-fcopysign.ll At least one of these (bug 2656) appears to be expecting the wrong codegen (a 16 byte load when we're accessing a smaller chunk of data). I think I see how to fix the others, but I need some time to code and test it all. I should get to it tomorrow.
Patch posted for review: http://reviews.llvm.org/D11477
I think we can finally declare this fixed: http://reviews.llvm.org/rL243361 And merged into 3.7 branch (thanks, Hans!): http://reviews.llvm.org/rL243435
The code generated now looks like this: (lldb) dis -n MSVCRT_fabs math.o`MSVCRT_fabs: math.o[0xd0e] <+0>: pushl %ebp math.o[0xd0f] <+1>: movl %esp, %ebp math.o[0xd11] <+3>: andl $-0x10, %esp math.o[0xd14] <+6>: subl $0x10, %esp math.o[0xd17] <+9>: calll 0xd1c ; <+14> at math.c:857 math.o[0xd1c] <+14>: popl %eax math.o[0xd1d] <+15>: movsd 0x8(%ebp), %xmm0 math.o[0xd22] <+20>: andpd 0x2874(%eax), %xmm0 math.o[0xd2a] <+28>: movlpd %xmm0, (%esp) math.o[0xd2f] <+33>: fldl (%esp) math.o[0xd32] <+36>: movl %ebp, %esp math.o[0xd34] <+38>: popl %ebp math.o[0xd35] <+39>: retl ...so, wine works again. Thank you very much!