illegal avx instruction while specifying native on a non-avx processor

Question

Here is the code causing my issue:

/*
 * Create a mempool called `name` holding `n` elements of `elt_size` bytes.
 *
 * A ring is created to store the objects, then a memzone is reserved for
 * the mempool header, the (cache-line rounded) private data and, when
 * `vaddr` is NULL, the objects themselves. When `vaddr` is not NULL the
 * objects live in that caller-provided buffer and `paddr` supplies
 * `pg_num` physical page addresses that are copied into mp->elt_pa.
 *
 * Returns the new mempool, or NULL on failure. Parameter validation
 * failures set rte_errno to EINVAL; on the later allocation failures
 * (ring, tailq entry, memzone) this function does not set rte_errno itself.
 *
 * The whole construction runs under the RTE_EAL_MEMPOOL_RWLOCK write lock;
 * the tailq insertion additionally takes RTE_EAL_TAILQ_RWLOCK.
 */
struct rte_mempool *
rte_mempool_xmem_create(const char *name, unsigned n, unsigned elt_size,
        unsigned cache_size, unsigned private_data_size,
        rte_mempool_ctor_t *mp_init, void *mp_init_arg,
        rte_mempool_obj_ctor_t *obj_init, void *obj_init_arg,
        int socket_id, unsigned flags, void *vaddr,
        const phys_addr_t paddr[], uint32_t pg_num, uint32_t pg_shift)
{
    char mz_name[RTE_MEMZONE_NAMESIZE];
    char rg_name[RTE_RING_NAMESIZE];
    struct rte_mempool_list *mempool_list;
    struct rte_mempool *mp = NULL;
    struct rte_tailq_entry *te;
    struct rte_ring *r;
    const struct rte_memzone *mz;
    size_t mempool_size;
    int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
    int rg_flags = 0;
    void *obj;
    struct rte_mempool_objsz objsz;
    void *startaddr;
    int page_size = getpagesize();

    /* compilation-time checks */
    RTE_BUILD_BUG_ON((sizeof(struct rte_mempool) &
              RTE_CACHE_LINE_MASK) != 0);
#if RTE_MEMPOOL_CACHE_MAX_SIZE > 0
    RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_cache) &
              RTE_CACHE_LINE_MASK) != 0);
    RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, local_cache) &
              RTE_CACHE_LINE_MASK) != 0);
#endif
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
    RTE_BUILD_BUG_ON((sizeof(struct rte_mempool_debug_stats) &
              RTE_CACHE_LINE_MASK) != 0);
    RTE_BUILD_BUG_ON((offsetof(struct rte_mempool, stats) &
              RTE_CACHE_LINE_MASK) != 0);
#endif

    mempool_list = RTE_TAILQ_CAST(rte_mempool_tailq.head, rte_mempool_list);

    /* asked cache too big */
    if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE) {
        rte_errno = EINVAL;
        return NULL;
    }

    /* check that we have both VA and PA */
    if (vaddr != NULL && paddr == NULL) {
        rte_errno = EINVAL;
        return NULL;
    }

    /* Check that pg_num and pg_shift parameters are valid. */
    /* pg_num must be at least the declared length of mp->elt_pa;
     * sizeof-style use of mp here does not dereference the NULL pointer. */
    if (pg_num < RTE_DIM(mp->elt_pa) || pg_shift > MEMPOOL_PG_SHIFT_MAX) {
        rte_errno = EINVAL;
        return NULL;
    }

    /* "no cache align" imply "no spread" */
    if (flags & MEMPOOL_F_NO_CACHE_ALIGN)
        flags |= MEMPOOL_F_NO_SPREAD;

    /* ring flags */
    if (flags & MEMPOOL_F_SP_PUT)
        rg_flags |= RING_F_SP_ENQ;
    if (flags & MEMPOOL_F_SC_GET)
        rg_flags |= RING_F_SC_DEQ;

    /* calculate mempool object sizes. */
    if (!rte_mempool_calc_obj_size(elt_size, flags, &objsz)) {
        rte_errno = EINVAL;
        return NULL;
    }

    /* from here on every exit path must go through the exit label
     * so that the mempool lock is released */
    rte_rwlock_write_lock(RTE_EAL_MEMPOOL_RWLOCK);

    /* allocate the ring that will be used to store objects */
    /* Ring functions will return appropriate errors if we are
     * running as a secondary process etc., so no checks made
     * in this function for that condition */
    snprintf(rg_name, sizeof(rg_name), RTE_MEMPOOL_MZ_FORMAT, name);
    r = rte_ring_create(rg_name, rte_align32pow2(n+1), socket_id, rg_flags);
    if (r == NULL)
        goto exit;

    /*
     * reserve a memory zone for this mempool: private data is
     * cache-aligned
     */
    private_data_size = (private_data_size +
                 RTE_CACHE_LINE_MASK) & (~RTE_CACHE_LINE_MASK);

    if (! rte_eal_has_hugepages()) {
        /*
         * expand private data size to a whole page, so that the
         * first pool element will start on a new standard page
         */
        int head = sizeof(struct rte_mempool);
        int new_size = (private_data_size + head) % page_size;
        if (new_size) {
            private_data_size += page_size - new_size;
        }
    }

    /* try to allocate tailq entry */
    te = rte_zmalloc("MEMPOOL_TAILQ_ENTRY", sizeof(*te), 0);
    if (te == NULL) {
        RTE_LOG(ERR, MEMPOOL, "Cannot allocate tailq entry!\n");
        /* NOTE(review): the ring created above is not released on
         * this path; no ring free call is visible in this function. */
        goto exit;
    }

    /*
     * If user provided an external memory buffer, then use it to
     * store mempool objects. Otherwise reserve memzone big enough to
     * hold mempool header and metadata plus mempool objects.
     */
    mempool_size = MEMPOOL_HEADER_SIZE(mp, pg_num) + private_data_size;
    if (vaddr == NULL)
        mempool_size += (size_t)objsz.total_size * n;

    if (! rte_eal_has_hugepages()) {
        /*
         * we want the memory pool to start on a page boundary,
         * because pool elements crossing page boundaries would
         * result in discontiguous physical addresses
         */
        mempool_size += page_size;
    }

    snprintf(mz_name, sizeof(mz_name), RTE_MEMPOOL_MZ_FORMAT, name);

    mz = rte_memzone_reserve(mz_name, mempool_size, socket_id, mz_flags);
    /*
     * no more memory: in this case we lose the previously reserved
     * space for the ring, as we cannot free it
     */
    if (mz == NULL) {
        rte_free(te);
        goto exit;
    }

    if (rte_eal_has_hugepages()) {
        startaddr = (void*)mz->addr;
    } else {
        /* align memory pool start address on a page boundary */
        /* rounds up using a bit mask, which assumes page_size is a
         * power of two; the extra page added to mempool_size above
         * leaves room for this shift */
        unsigned long addr = (unsigned long)mz->addr;
        if (addr & (page_size - 1)) {
            addr += page_size;
            addr &= ~(page_size - 1);
        }
        startaddr = (void*)addr;
    }

    /* init the mempool structure */
    mp = startaddr;
    memset(mp, 0, sizeof(*mp));
    snprintf(mp->name, sizeof(mp->name), "%s", name);
    mp->phys_addr = mz->phys_addr;
    mp->ring = r;
    mp->size = n;
    mp->flags = flags;
    mp->elt_size = objsz.elt_size;
    mp->header_size = objsz.header_size;
    mp->trailer_size = objsz.trailer_size;
    mp->cache_size = cache_size;
    /* NOTE(review): the "------>" text on the continuation line below is
     * not C; it appears to be the poster's marker for the source line
     * where the illegal instruction was reported. The product is cast
     * back to uint32_t; CACHE_FLUSHTHRESH_MULTIPLIER is defined outside
     * this snippet (presumably a floating-point constant, which would
     * explain the integer-to-double conversion in the disassembly). */
    mp->cache_flushthresh = (uint32_t)
 ------>    (cache_size * CACHE_FLUSHTHRESH_MULTIPLIER);
    mp->private_data_size = private_data_size;

    /* calculate address of the first element for continuous mempool. */
    obj = (char *)mp + MEMPOOL_HEADER_SIZE(mp, pg_num) +
        private_data_size;

    /* populate address translation fields. */
    mp->pg_num = pg_num;
    mp->pg_shift = pg_shift;
    mp->pg_mask = RTE_LEN2MASK(mp->pg_shift, typeof(mp->pg_mask));

    /* mempool elements allocated together with mempool */
    if (vaddr == NULL) {
        mp->elt_va_start = (uintptr_t)obj;
        /* physical address of the first element: same offset from
         * the mempool header as in virtual address space */
        mp->elt_pa[0] = mp->phys_addr +
            (mp->elt_va_start - (uintptr_t)mp);

    /* mempool elements in a separate chunk of memory. */
    } else {
        mp->elt_va_start = (uintptr_t)vaddr;
        memcpy(mp->elt_pa, paddr, sizeof (mp->elt_pa[0]) * pg_num);
    }

    /* start with an empty element range */
    mp->elt_va_end = mp->elt_va_start;

    /* call the initializer */
    if (mp_init)
        mp_init(mp, mp_init_arg);

    mempool_populate(mp, n, 1, obj_init, obj_init_arg);

    /* publish the new mempool in the global mempool list */
    te->data = (void *) mp;

    rte_rwlock_write_lock(RTE_EAL_TAILQ_RWLOCK);
    TAILQ_INSERT_TAIL(mempool_list, te, next);
    rte_rwlock_write_unlock(RTE_EAL_TAILQ_RWLOCK);

exit:
    /* common exit: mp is still NULL if any allocation above failed */
    rte_rwlock_write_unlock(RTE_EAL_MEMPOOL_RWLOCK);

    return mp;
}

cat /proc/cpuinfo produces:

flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts mmx fxsr sse sse2 ss syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts nopl xtopology tsc_reliable nonstop_tsc aperfmperf pni pclmulqdq ssse3 cx16 sse4_1 sse4_2 x2apic popcnt aes hypervisor lahf_lm ida arat epb pln pts dts

No AVX here.

My compilation line is:

gcc -Wp,-MD,./.rte_mempool.o.d.tmp -m64 -pthread  -march=native -DRTE_MACHINE_CPUFLAG_SSE -DRTE_MACHINE_CPUFLAG_SSE2 -DRTE_MACHINE_CPUFLAG_SSE3 -DRTE_MACHINE_CPUFLAG_SSSE3 -DRTE_MACHINE_CPUFLAG_SSE4_1 -DRTE_MACHINE_CPUFLAG_SSE4_2 -DRTE_MACHINE_CPUFLAG_AES -DRTE_MACHINE_CPUFLAG_PCLMULQDQ -DRTE_COMPILE_TIME_CPUFLAGS=RTE_CPUFLAG_SSE,RTE_CPUFLAG_SSE2,RTE_CPUFLAG_SSE3,RTE_CPUFLAG_SSSE3,RTE_CPUFLAG_SSE4_1,RTE_CPUFLAG_SSE4_2,RTE_CPUFLAG_AES,RTE_CPUFLAG_PCLMULQDQ  -I/home/dpdk/sources/dpdk/DPDK-2.0.0/x86_64-vm-gcc/include -include /home/dpdk/sources/dpdk/DPDK-2.0.0/x86_64-vm-gcc/include/rte_config.h -W -Wall -Werror -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations -Wold-style-definition -Wpointer-arith -Wcast-align -Wnested-externs -Wcast-qual -Wformat-nonliteral -Wformat-security -Wundef -Wwrite-strings -I/home/dpdk/sources/dpdk/DPDK-2.0.0/lib/librte_mempool -g -O3  -g -fPIC -o rte_mempool.o -c /home/dpdk/sources/dpdk/DPDK-2.0.0/lib/librte_mempool/rte_mempool.c

Clearly, native compilation is specified.

My gcc version is gcc (GCC) 4.6.3 20120306 (Red Hat 4.6.3-2).

gcc reports the following as to the 'native' architecture selections:

gcc -### -E - -march=native 2>&1 | sed -r '/cc1/!d;s/(")|(^.* - )//g'
-march=corei7 -mcx16 -msahf -mno-movbe -maes -mpclmul -mpopcnt -mno-abm -mno-lwp -mno-fma -mno-fma4 -mno-xop -mno-bmi -mno-tbm -mno-avx -msse4.2 -msse4.1 --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=20480 -mtune=corei7

The code crashes with Illegal Instruction on:

   0x00000000005e49f6 <+598>:   callq  0x40b370 <memset@plt>
   0x00000000005e49fb <+603>:   mov    0x30(%rsp),%rcx
   0x00000000005e4a00 <+608>:   lea    0x283c00(%rip),%rdx        # 0x868607
   0x00000000005e4a07 <+615>:   mov    $0x20,%esi
   0x00000000005e4a0c <+620>:   mov    %rbx,%rdi
   0x00000000005e4a0f <+623>:   xor    %eax,%eax
   0x00000000005e4a11 <+625>:   callq  0x40be40 <snprintf@plt>
=> 0x00000000005e4a16 <+630>:   vcvtsi2sd %r13,%xmm0,%xmm0
   0x00000000005e4a1b <+635>:   mov    0x28(%rsp),%r8
   0x00000000005e4a20 <+640>:   mov    0xd0(%rsp),%edx
   0x00000000005e4a27 <+647>:   mov    0x50(%rsp),%rcx
   0x00000000005e4a2c <+652>:   mov    0x20(%r8),%rax
   0x00000000005e4a30 <+656>:   mov    %ebp,0x34(%rbx)
   0x00000000005e4a33 <+659>:   mov    %edx,0x40(%rbx)

Why is vcvtsi2sd used on a non-avx machine where gcc promises to not use avx instructions?

`-mtune=corei7` - are there any Core-i7 **without** AVX? Why don't you explicitly specify the target, but rely on the default setting of gcc? — too honest for this site, Aug 20 '16 at 19:03
@Olaf From the gcc-manual: `'corei7' Intel Core i7 CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1 and SSE4.2 instruction set support. 'corei7-avx' Intel Core i7 CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AES and PCLMUL instruction set support.` — EOF, Aug 20 '16 at 19:10
Could you post a minimal example of code that produces this behavior? — EOF, Aug 20 '16 at 19:13
Because that is the meaning of the native arch indication. I expect gcc to do as it promises: use the native instruction set. I'll specify the corei7 as you suggest, though, to see whether that would make a difference. I am using DPDK, which is Intel's networking stack, so I am dependent on their set of makefiles. Hopefully I can make this work. — howling cat, Aug 20 '16 at 19:16
@EOF: Thanks, but I actually asked about the CPU, not the compiler. The question was meant seriously — too honest for this site, Aug 20 '16 at 19:16
EOF, the code would not help you any, but here it is: `565│ mp->header_size = objsz.header_size;` `566│ mp->trailer_size = objsz.trailer_size;` `567│ mp->cache_size = cache_size;` `568│ mp->cache_flushthresh = (uint32_t)` `569├> (cache_size * CACHE_FLUSHTHRESH_MULTIPLIER);` `570│ mp->private_data_size = private_data_size;` `571│` `572│ /* calculate address of the first element for continuous mempool. */573│ obj = (char *)mp + MEMPOOL_HEADER_SIZE(mp, pg_num) +` `574│ private_data_size;` — howling cat, Aug 20 '16 at 19:18
@howlingcat That clearly is neither a complete nor verifiable example. In particular it's clearly not the code corresponding to the assembly you've posted. Also, add the code to the question, you can edit it. — EOF, Aug 20 '16 at 19:26
@Olaf the i5/i7 naming was already in use with Nehalem, AVX was introduced 3 years later in SandyBridge. — harold, Aug 20 '16 at 19:26
@Olaf, I used corei7 as the architecture flag, and it worked. This is very unfortunate, though: we are developing on VM clusters, and it's easy to simply specify 'native' instead of having to check the actual processor every time. I am still curious as to what the reason for this is. — howling cat, Aug 20 '16 at 19:41
@EOF: Do you want to see a longer snippet of the code? How do you think that would help you? I'd like to understand. — howling cat, Aug 20 '16 at 19:41
Are you sure you did a clean rebuild, and didn't have any `.o`s compiled with different options? `-march=native` should avoid using AVX on a CPU that doesn't support it. Your `gcc | sed` output shows `-mno-avx`. So unless your crusty old version of gcc from 2012 has a bug, it shouldn't be using AVX. Also, what CPU do you have? Is it one that might fool an AVX-detection function? like a Haswell Pentium (no AVX support on Pentium/Celeron, only i3/i5/i7)? Or a virtual machine which doesn't pass through AVX, on hardware which does support AVX? — Peter Cordes, Aug 20 '16 at 19:42
@howlingcat: EOF wants you to [edit] the code into your question, where it can be formatted properly, and you can include a [mcve]. The mess you pasted into a comment obviously won't compile, since it's not a whole function, and it has line numbers mixed in. That will help because then other people can try to reproduce this with the same compiler and options, or other compilers on http://gcc.godbolt.org/ — Peter Cordes, Aug 20 '16 at 19:44
@PeterCordes, wouldn't the VM's cpuinfo show the physical CPU capabilities? It does not show avx. gcc also promises not to produce avx instructions, yet it does. — howling cat, Aug 20 '16 at 19:56
@EOF, I added the code to my original post. I'll try adding the entire file. It's open-source. — howling cat, Aug 20 '16 at 19:56
@howlingcat: No, `CPUID` would usually be intercepted by the VM, and show only the feature bits the VM is prepared to support. e.g. disable AVX to allow migration of the VM to a physical CPU that doesn't support AVX. Presumably this is why your /proc/cpuinfo doesn't show AVX (because Linux used CPUID at boot time). But if something in gcc erroneously based its decision on the CPU model name/number, instead of the actual feature bits, it might generate AVX code. Or maybe the VM migrated from hardware without AVX, and is now running on HW with AVX, and the VM is passing it through? — Peter Cordes, Aug 20 '16 at 20:17
Try running a command like `x86info`, which runs CPUID instead of reading /proc/cpuinfo. However, the gcc output you showed does say `-mno-avx`, so this doesn't really make sense. Have you tried a newer gcc, or clang, on your same VM setup? gcc 4.6 is pretty ancient. 5.4 is the current stable, and will do a better job auto-vectorizing / optimizing for current CPUs (`-mtune=haswell`). — Peter Cordes, Aug 20 '16 at 20:22
Looks like this bug: https://bugs.launchpad.net/ubuntu/+source/gcc-4.6/+bug/1288935 — Severin Pappadeux, Aug 21 '16 at 01:23
@SeverinPappadeux: Maybe, but note that that bug report is marked as invalid because the VM is advertising AVX support. I suspect this might have happened here; with a VM migrating from a non-AVX CPU to an AVX CPU. So OS support for AVX is not enabled ([the OSXSAVE bit](http://stackoverflow.com/a/34071400/224132)), meaning AVX instructions will fault. If gcc only looks at the AVX feature bit, not also the OSXSAVE feature bit in CPUID, that could be the problem. Do all FP instructions use `v`-prefixed AVX instructions? Or just the convert insn? — Peter Cordes, Aug 21 '16 at 02:45
@PeterCordes, here is the CPU Model information generated by x86info: CPU Model (x86info's best guess): Core i7 (Nehalem) [Clarkdale/Arrandale]. — howling cat, Aug 22 '16 at 05:31
@howlingcat: It shouldn't need to guess. What's the actual version string? Something like `Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz` will be programmed into the silicon, in ASCII, where x86info can get it with the CPUID instruction. See https://software.intel.com/en-us/articles/intel-architecture-and-processor-identification-with-cpuid-model-and-family-numbers. (It could well be a Nehalem, but the model number and model name string would confirm it.) Also, does x86info say the AVX feature bit is enabled or not? I was guessing that maybe your VM now supports AVX, but didn't at boot — Peter Cordes, Aug 22 '16 at 05:43
Here is x86info's output: x86info x86info v1.29. Dave Jones 2001-2011 Feedback to . Found 4 identical CPUs Extended Family: 0 Extended Model: 2 Family: 6 Model: 37 Stepping: 1 Type: 0 (Original OEM) CPU Model (x86info's best guess): Core i7 (Nehalem) [Clarkdale/Arrandale] Processor name string (BIOS programmed): Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz Total processor threads: 4 This system has 1 quad-core processor running at an estimated 2.40GHz — howling cat, Aug 22 '16 at 11:42
"E5-2630 v3" is Haswell. Your old version of x86info apparently doesn't know about anything newer than Nehalem. — Peter Cordes, Apr 02 '23 at 19:28

illegal avx instruction while specifying native on a non-avx processor

0 Answers