
C++ Code

void Mesh::RecalculateNormals()
{
    for (int i = 0; i < indexCount; i += 3)
    {
        const int ia = indices[i];
        const int ib = indices[i + 1];
        const int ic = indices[i + 2];

        // Face normal: cross product of two triangle edges
        const glm::vec3 e1 = glm::vec3(vert[ia].position) - glm::vec3(vert[ib].position);
        const glm::vec3 e2 = glm::vec3(vert[ic].position) - glm::vec3(vert[ib].position);
        const glm::vec3 no = cross(e1, e2);

        // Accumulate the (area-weighted) face normal into each vertex
        vert[ia].normal += glm::vec4(no, 0.0f);
        vert[ib].normal += glm::vec4(no, 0.0f);
        vert[ic].normal += glm::vec4(no, 0.0f);
    }

    // Normalize the accumulated per-vertex normals
    for (int i = 0; i < vertexCount; i++)
        vert[i].normal = glm::vec4(glm::normalize(glm::vec3(vert[i].normal)), 0.0f);
}

New Code with SIMD

// From: https://geometrian.com/programming/tutorials/cross-product/index.php
[[nodiscard]] inline static __m128 cross_product(__m128 const& vec0, __m128 const& vec1) {
    __m128 tmp0 = _mm_shuffle_ps(vec0, vec0, _MM_SHUFFLE(3, 0, 2, 1)); // vec0.yzxw
    __m128 tmp1 = _mm_shuffle_ps(vec1, vec1, _MM_SHUFFLE(3, 1, 0, 2)); // vec1.zxyw
    __m128 tmp2 = _mm_mul_ps(tmp0, vec1);
    __m128 tmp3 = _mm_mul_ps(tmp0, tmp1);
    __m128 tmp4 = _mm_shuffle_ps(tmp2, tmp2, _MM_SHUFFLE(3, 0, 2, 1));
    return _mm_sub_ps(tmp3, tmp4); // = cross(vec0, vec1), using 3 shuffles instead of 4
}

void normalize(const glm::vec4& lpInput, glm::vec4& lpOutput) {
    const __m128& vInput = reinterpret_cast<const __m128&>(lpInput); // load input vector (x, y, z, a)
    __m128 vSquared = _mm_mul_ps(vInput, vInput); // square the input values
    __m128 vHalfSum = _mm_hadd_ps(vSquared, vSquared);
    __m128 vSum = _mm_hadd_ps(vHalfSum, vHalfSum); // compute the sum of values
    float fInvSqrt; _mm_store_ss(&fInvSqrt, _mm_rsqrt_ss(vSum)); // compute the inverse sqrt
    __m128 vNormalized = _mm_mul_ps(vInput, _mm_set1_ps(fInvSqrt)); // normalize the input vector
    lpOutput = reinterpret_cast<const glm::vec4&>(vNormalized); // store normalized vector (x, y, z, a)
}

void Mesh::RecalculateNormals()
{
    float result[4];
    glm::vec4 tmp;
    for (int i = 0; i < indexCount; i += 3)
    {
        const int ia = indices[i];
        const int ib = indices[i + 1];
        const int ic = indices[i + 2];

        __m128 iav = _mm_set_ps(vert[ia].position.x, vert[ia].position.y, vert[ia].position.z, 0.0f);
        __m128 ibv = _mm_set_ps(vert[ib].position.x, vert[ib].position.y, vert[ib].position.z, 0.0f);
        __m128 icv = _mm_set_ps(vert[ic].position.x, vert[ic].position.y, vert[ic].position.z, 0.0f);

        __m128 e1i = _mm_sub_ps(iav, ibv);
        __m128 e2i = _mm_sub_ps(icv, ibv);

        //const glm::vec3 e1 = glm::vec3(vert[ia].position) - glm::vec3(vert[ib].position);
        //const glm::vec3 e2 = glm::vec3(vert[ic].position) - glm::vec3(vert[ib].position);
        //const glm::vec3 no = cross(e1, e2);

        __m128 no = cross_product(e1i, e2i);

        //vert[ia].normal += glm::vec4(no, 0.0);
        //vert[ib].normal += glm::vec4(no, 0.0);
        //vert[ic].normal += glm::vec4(no, 0.0);
        _mm_storeu_ps(result, no);
        tmp = glm::make_vec4(result);
        vert[ia].normal += tmp;
        vert[ib].normal += tmp;
        vert[ic].normal += tmp;
    }

    for (int i = 0; i < vertexCount; i++)
        vert[i].normal = glm::vec4(glm::normalize(glm::vec3(vert[i].normal)), 0.0f);
}

But it's not working. Can anyone please help me find the problems?

(I am very new to SIMD)

Jaysmito Mukherjee

  • For situations like this I would normally step through the code in a debugger, with some known input values, comparing results with expectations. It helps if you have a known good reference implementation for comparing intermediate/final results. – Paul R Feb 06 '22 at 10:17
  • Not a correctness problem, but for performance `_mm_store_ss` / `_mm_set1_ps` seems very weird; 2x `haddps` already ([inefficiently](https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026)) leaves the result broadcasted to all 4 elements of the vector, and if you did need to broadcast the low element you would just use `_mm_shuffle_ps(v,v, _MM_SHUFFLE(0,0,0,0))`. A smart compiler would optimize away the `_mm_store_ss` and not actually store anything to stack space, but some compilers take intrinsics literally. – Peter Cordes Feb 06 '22 at 10:42
  • Be aware that `_mm_set_ps` uses big-endian notation, i.e., if you do `iav = _mm_set_ps(x,y,z,0)`, then `iav[0] = 0, ..., iav[3] = x`. – chtz Feb 06 '22 at 13:12
  • Note that the reinterpret cast assumes that the values are aligned in memory. I think it is safer to use `_mm_loadu_ps` if they are not assumed to be aligned. Otherwise this is fine. Moreover, why do you store the temporary result to `fInvSqrt` while this is not required? – Jérôme Richard Feb 06 '22 at 16:44

1 Answer


Your correctness problem is probably caused by Intel screwing up the order of their `_mm_set_ps` and similar intrinsics. To create a vector with the x, y, z floats in the first 3 lanes, write either `_mm_set_ps( 0, z, y, x )` or `_mm_setr_ps( x, y, z, 0 )`.
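
Applied to your loop, a minimal sketch of the corrected loads (everything else can stay the same):

// _mm_setr_ps places x in lane 0, y in lane 1, z in lane 2,
// which is the layout cross_product and the rest of the loop expect
__m128 iav = _mm_setr_ps(vert[ia].position.x, vert[ia].position.y, vert[ia].position.z, 0.0f);
__m128 ibv = _mm_setr_ps(vert[ib].position.x, vert[ib].position.y, vert[ib].position.z, 0.0f);
__m128 icv = _mm_setr_ps(vert[ic].position.x, vert[ic].position.y, vert[ic].position.z, 0.0f);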

Here are better primitive functions you are going to need for that. The code assumes you have at least SSE 4.1, which was introduced with the Intel Core 2 in 2007; according to the Steam survey, its market penetration is 98.89% by now.

// Load an FP32 3D vector from memory
inline __m128 loadFloat3( const glm::vec3& pos )
{
    static_assert( sizeof( glm::vec3 ) == 12, "Expected to be 12 bytes (3 floats)" );

    __m128 low = _mm_castpd_ps( _mm_load_sd( (const double*)&pos ) );
    // Modern compilers are optimizing the following 2 intrinsics into a single insertps with a memory operand
    __m128 high = _mm_load_ss( ( (const float*)&pos ) + 2 );
    // imm8 = 0x28: take lane 0 of `high`, insert it into lane 2, and zero the W lane
    return _mm_insert_ps( low, high, 0x28 );
}

// Store an FP32 3D vector to memory
inline void storeFloat3( glm::vec3& pos, __m128 vec )
{
    _mm_store_sd( (double*)&pos, _mm_castps_pd( vec ) );
    ( (int*)( &pos ) )[ 2 ] = _mm_extract_ps( vec, 2 );
}

// Zero out W lane, and store an FP32 vector to memory
inline void storeFloat4( glm::vec4& pos, __m128 vec )
{
    static_assert( sizeof( glm::vec4 ) == 16, "Expected to be 16 bytes" );
    vec = _mm_insert_ps( vec, vec, 0b1000 );
    _mm_storeu_ps( (float*)&pos, vec );
}

// Normalize a 3D vector; if the source is zero, will instead return a zero vector
inline __m128 vector3Normalize( __m128 vec )
{
    // Compute x^2 + y^2 + z^2, broadcast the result to all 4 lanes
    __m128 dp = _mm_dp_ps( vec, vec, 0b01111111 );
    // res = vec / sqrt( dp )
    __m128 res = _mm_div_ps( vec, _mm_sqrt_ps( dp ) );
    // Compare for dp > 0
    __m128 positiveLength = _mm_cmpgt_ps( dp, _mm_setzero_ps() );
    // Zero out the vector if the dot product was zero
    return _mm_and_ps( res, positiveLength );
}

// Copy-pasted from: https://github.com/microsoft/DirectXMath/blob/jan2021/Inc/DirectXMathVector.inl#L9506-L9519
// MIT license
#ifdef __AVX__
#define XM_PERMUTE_PS( v, c ) _mm_permute_ps( v, c )
#else
#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c )
#endif

#ifdef __AVX2__
#define XM_FNMADD_PS( a, b, c ) _mm_fnmadd_ps((a), (b), (c))
#else
#define XM_FNMADD_PS( a, b, c ) _mm_sub_ps((c), _mm_mul_ps((a), (b)))
#endif

inline __m128 vector3Cross( __m128 V1, __m128 V2 )
{
    // y1,z1,x1,w1
    __m128 vTemp1 = XM_PERMUTE_PS( V1, _MM_SHUFFLE( 3, 0, 2, 1 ) );
    // z2,x2,y2,w2
    __m128 vTemp2 = XM_PERMUTE_PS( V2, _MM_SHUFFLE( 3, 1, 0, 2 ) );
    // Perform the left operation
    __m128 vResult = _mm_mul_ps( vTemp1, vTemp2 );
    // z1, x1, y1, w1
    vTemp1 = XM_PERMUTE_PS( vTemp1, _MM_SHUFFLE( 3, 0, 2, 1 ) );
    // y2, z2, x2, w2
    vTemp2 = XM_PERMUTE_PS( vTemp2, _MM_SHUFFLE( 3, 1, 0, 2 ) );
    // Perform the right operation
    vResult = XM_FNMADD_PS( vTemp1, vTemp2, vResult );
    return vResult;
}
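
Putting these together, your main loop could look like this. This is a sketch, not tested code; it assumes `position` and `normal` are both `glm::vec4` (as your code implies), reuses the `Mesh` fields from the question, and, instead of glm's `operator +=`, loads each per-vertex normal with `_mm_loadu_ps`, adds the cross product with `_mm_add_ps`, and stores it back:

void Mesh::RecalculateNormals()
{
    for( int i = 0; i < indexCount; i += 3 )
    {
        const int ia = indices[ i ];
        const int ib = indices[ i + 1 ];
        const int ic = indices[ i + 2 ];

        // Load the three positions; the W lanes don't matter, because
        // vector3Cross produces 0 there and storeFloat4 zeroes W anyway
        const __m128 pa = _mm_loadu_ps( &vert[ ia ].position.x );
        const __m128 pb = _mm_loadu_ps( &vert[ ib ].position.x );
        const __m128 pc = _mm_loadu_ps( &vert[ ic ].position.x );

        // Face normal from the two edge vectors, same orientation as the scalar code
        const __m128 no = vector3Cross( _mm_sub_ps( pa, pb ), _mm_sub_ps( pc, pb ) );

        // Load each accumulated normal, add the cross product, store it back;
        // assumes the normals start out zeroed, as in the original code
        storeFloat4( vert[ ia ].normal, _mm_add_ps( _mm_loadu_ps( &vert[ ia ].normal.x ), no ) );
        storeFloat4( vert[ ib ].normal, _mm_add_ps( _mm_loadu_ps( &vert[ ib ].normal.x ), no ) );
        storeFloat4( vert[ ic ].normal, _mm_add_ps( _mm_loadu_ps( &vert[ ic ].normal.x ), no ) );
    }

    // Normalize the accumulated per-vertex normals
    for( int i = 0; i < vertexCount; i++ )
        storeFloat4( vert[ i ].normal, vector3Normalize( _mm_loadu_ps( &vert[ i ].normal.x ) ) );
}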

You should not expect high-quality results from that algorithm. It computes per-vertex normals by accumulating triangle normals weighted by triangle area. This means that if your mesh has both small and large triangles, the contributions of the small triangles will be swamped by the large ones, which is suboptimal. A better way is to weight the normals by the triangle's angle at each vertex. See this article for more information.
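
For illustration, a minimal scalar sketch of the angle-weighted accumulation, to replace the body of the triangle loop. It is not from the article; `cornerAngle` is a hypothetical helper, and it needs <cmath> for std::acos:

// Hypothetical helper: angle at corner p, between the edges towards q and r
inline float cornerAngle( const glm::vec3& p, const glm::vec3& q, const glm::vec3& r )
{
    const float d = glm::dot( glm::normalize( q - p ), glm::normalize( r - p ) );
    return std::acos( glm::clamp( d, -1.0f, 1.0f ) ); // clamp guards against FP drift
}

// Inside the triangle loop:
const glm::vec3 pa = glm::vec3( vert[ ia ].position );
const glm::vec3 pb = glm::vec3( vert[ ib ].position );
const glm::vec3 pc = glm::vec3( vert[ ic ].position );

// Unit face normal: normalizing removes the area weighting
const glm::vec3 no = glm::normalize( glm::cross( pa - pb, pc - pb ) );

// Each vertex gets the face normal scaled by the triangle's angle at that vertex
vert[ ia ].normal += glm::vec4( no * cornerAngle( pa, pb, pc ), 0.0f );
vert[ ib ].normal += glm::vec4( no * cornerAngle( pb, pa, pc ), 0.0f );
vert[ ic ].normal += glm::vec4( no * cornerAngle( pc, pa, pb ), 0.0f );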

Soonts
  • SSE4.1 was new in 2nd-gen Core2 (2007/2008 [Penryn and Wolfdale](https://en.wikipedia.org/wiki/Intel_Core_(microarchitecture)#Penryn/Wolfdale_(45_nm))), not 2006 (Merom/Conroe). Agreed it's probably something you can assume, but there are Core2 CPUs without it. The most recent CPUs without it are AMD, though: [Phenom II (launched in 2008, discontinued 2012)](https://en.wikipedia.org/wiki/Phenom_II) doesn't have SSSE3 either. (It does have SSE3, so it does have `dpps`.) – Peter Cordes Feb 06 '22 at 21:35
  • @PeterCordes Good catch, updated – Soonts Feb 06 '22 at 21:47
  • @Soonts My meshes here will have same-size triangles 95% of the time, and I do not need a very precise normal calculation; an approximate one will do. One problem I have with this is that all my normals are stored in vec4 (vec4 aligns easily in memory, causing fewer problems when passing to the GPU), where only the first 3 values are useful and the 4th one is garbage. So is there a way I can just pass a vec4 (but using only the first 3 values) into these functions rather than a vec3, as converting vec3 to vec4 will slow things down? – Jaysmito Mukherjee Feb 07 '22 at 03:27
  • @JaysmitoMukherjee Have you actually tested that? I would expect on modern hardware, both CPUs and GPUs, RAM bandwidth to be more expensive than things not being aligned by 16 bytes. If you did and it helps, use the `_mm_loadu_ps` intrinsic to load the normals, and `storeFloat4` (just updated) to store them back. – Soonts Feb 07 '22 at 11:02
  • One more thing: instead of using the operator `+=` in the main loop, you should load all 3 per-vertex normals, update them with `_mm_add_ps` by adding the cross product, then store them back. – Soonts Feb 07 '22 at 11:06