3

Some programs like this one (I'm not the author, but I appreciate the work):

// 22 cycles per pixel mandelbrot (cascadelake)
#include <algorithm>
#include <array>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string_view>
#include <valarray>

namespace {

// Output image dimensions, in pixels.
constexpr int kWidth{2000};
constexpr int kHeight{2000};
// Number of horizontally adjacent pixels processed together as one valarray.
constexpr int kChunkSize{256};

// One 8-bit sample per pixel.
using Pixel = std::uint8_t;
// The complete image: kWidth * kHeight pixels stored in one flat array.
using Image = std::array<Pixel, kWidth * kHeight>;
// Per-chunk floating-point working values.
using FloatChunk = std::valarray<float>;
// Per-chunk iteration counters; they share the pixel type.
using IterChunk = std::valarray<Pixel>;

// Mapping of integer (x,y) to complex(Re,Im):
//  re = 3. * x / width - 2.25
//  im = (1.5 * height - 3. * y) / width

// Renders the Mandelbrot set into a kWidth x kHeight 8-bit grayscale image.
//
// Pixel coordinates are centred on the image: x runs over
// [-(kWidth - kWidth/2), kWidth/2) and y over [-(kHeight - kHeight/2),
// kHeight/2). Each pixel is mapped to the complex constant
//   c = (x_scale * x + x_mid) + i * (y_scale * y + y_mid).
//
// Params:
//   x_scale, y_scale  complex-plane units per pixel along each axis.
//   x_mid, y_mid      complex-plane coordinates of pixel (0, 0).
// Returns:
//   The image by value. Points that never escape are black (0); all others
//   get a brightness that grows with the number of iterations survived.
//
// NOTE(review): Image is a multi-megabyte std::array returned by value, so
// callers need enough stack space for it.
Image Mandelbrot(float x_scale, float y_scale, float x_mid, float y_mid) {
  constexpr int kMaxIterations = 35;

  // Padded by one chunk so the last chunk of the last row can overrun.
  std::valarray<Pixel> image_temp(kWidth * kHeight + kChunkSize - 1);
  FloatChunk mb_c_re(kChunkSize);
  FloatChunk mb_c_im(kChunkSize);
  FloatChunk mb_z_re(kChunkSize);
  FloatChunk mb_z_im(kChunkSize);
  // Scratch buffers for the next z value, allocated once instead of once per
  // iteration.
  FloatChunk next_z_re(kChunkSize);
  FloatChunk next_z_im(kChunkSize);
  std::valarray<bool> mask(kChunkSize);
  IterChunk mb_i(kChunkSize);
  IterChunk ones(kChunkSize);
  ones = 1;

  // Flat index of centred pixel (0, 0); shifts the most negative (x, y) to
  // index 0.
  const int pix_offset =
      (kHeight - kHeight / 2) * kWidth + (kWidth - kWidth / 2);

  // Don't bother worrying about the seam at the right: the last chunk of a
  // row overcomputes into the start of the next row, which is then
  // overwritten when that row is rendered (or lands in the padding).
  for (int y = kHeight / 2 - kHeight; y < kHeight / 2; ++y) {
    mb_c_im = y_scale * y + y_mid;
    for (int x = kWidth / 2 - kWidth; x < kWidth / 2; x += kChunkSize) {
      for (int xx = 0; xx < kChunkSize; ++xx) {
        mb_c_re[xx] = x_scale * (x + xx) + x_mid;
      }
      mb_z_re = 0.f;
      mb_z_im = 0.f;
      mb_i = 0;

      for (int i = 0; i < kMaxIterations; ++i) {
        // Materialise the mask from the CURRENT z before z is replaced.
        // Keeping valarray expressions in `auto` variables is not portable:
        // implementations may return lazy proxy objects that are re-evaluated
        // (or left dangling) when used after their operands change.
        mask = (mb_z_re * mb_z_re + mb_z_im * mb_z_im) < 4.0f;
        mb_i[mask] += ones[mask];

        // z <- z^2 + c for every lane, escaped or not; escaped lanes simply
        // stop being counted.
        next_z_re = mb_z_re * mb_z_re - mb_z_im * mb_z_im + mb_c_re;
        next_z_im = 2.0f * mb_z_re * mb_z_im + mb_c_im;
        mb_z_re.swap(next_z_re);
        mb_z_im.swap(next_z_im);

        // sum() over bools is true if any lane was still inside the radius.
        if (!mask.sum()) break;
      }

      // Lanes that survived (almost) every iteration are treated as inside
      // the set and drawn black.
      mb_i[mb_i >= 34] = 0;
      // Stretch the remaining counts 0..33 onto 0..255 (33*7 + 33*3/4 = 255).
      mb_i = mb_i * 7 + (mb_i * 3) / 4;

      image_temp[std::slice(y * kWidth + x + pix_offset, kChunkSize, 1)] = mb_i;
    }
  }

  // Drop the padding and hand back exactly kWidth * kHeight pixels.
  Image image{};
  std::copy(std::begin(image_temp), std::begin(image_temp) + kWidth * kHeight,
            image.begin());
  return image;
}

}  // namespace

#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86intrin.h>
#endif

// Convenience wrapper so call sites need not spell __rdtsc() directly.
// Returns the value produced by the __rdtsc() intrinsic.
inline uint64_t readTSC() {
  // No fencing is done here. Optionally add _mm_lfence() before the read to
  // wait for earlier instructions to retire, and/or after it to block later
  // instructions until rdtsc retires.
  return __rdtsc();
}

// Renders the Mandelbrot image, prints the measured TSC ticks per pixel, and
// writes the image to "mandelbrot.ppm" as a binary graymap (magic "P5").
// Returns 0 on success and 1 if the output file cannot be opened.
int main() {
  // Complex-plane units per pixel. The view spans 3.0 units across the image
  // width (re = 3 * x / width - 2.25 for x in [0, width)), which with centred
  // pixel coordinates becomes scale = 3 / width and x_mid = -0.75.
  // The previous `kWidth / 3` was the reciprocal (666 units per pixel).
  constexpr float kScale = 3.0f / kWidth;

  const auto t1 = readTSC();
  const auto image = Mandelbrot(kScale, kScale, -3.0f / 4, 0);
  const auto t2 = readTSC();
  // '\n' instead of std::endl: no forced flush is needed here.
  std::cout << (t2 - t1) / (kWidth * kHeight) << " cycles per pixel" << '\n';

  // Binary mode is required: the payload is raw bytes, and text mode would
  // rewrite every byte of value 10 on platforms with newline translation.
  std::ofstream fout("mandelbrot.ppm", std::ios::binary);
  if (!fout) {
    std::cerr << "Could not open the file!\n";
    return 1;
  }

  // Header, then the whole pixel buffer in a single unformatted write.
  fout << "P5\n" << kWidth << ' ' << kHeight << " 255\n";
  fout.write(reinterpret_cast<const char *>(image.data()),
             static_cast<std::streamsize>(image.size()));

  return 0;
}

have no explicit vectorization, so they are relatively more readable, yet they retain a good portion of the speedup gained from explicit vectorization with intrinsics or GNU vector extensions.

Are there plans for future C++ versions to give std::valarray more attention (so that its performance approaches that of explicit intrinsics)? What about structs made of multiple numbers (like std::complex)? Will std::valarray support them?

huseyin tugrul buyukisik
  • 11,469
  • 4
  • 45
  • 97
  • 4
    `std::valarray` is almost dead and will probably remain so. – Evg Apr 21 '22 at 08:50
  • @Evg did you mean "to be deprecated" dead or just "not used much" dead? – huseyin tugrul buyukisik Apr 21 '22 at 08:51
  • 4
    "Not used much" dead. It's an old beast, it might be hard to deprecate it. – Evg Apr 21 '22 at 08:52
  • 1
    afaik `std::valarray` predates expression templates. Since there are expression templates, `std::valarray` isn't that attractive anymore. – 463035818_is_not_an_ai Apr 21 '22 at 08:57
  • 1
    So you mean, If there is a matrix-multiplication job, one can write a constexpr expression template and fold 1024x1024 matrix into submatrices until 2x2 size is reached (at compile-time) and do the math without using temporary arrays, instead of simply using valarray and placing the 3-nested for loops? – huseyin tugrul buyukisik Apr 21 '22 at 09:04
  • If it's older than the "register" keyword, then there is a strong probability of survival for valarray, right? – huseyin tugrul buyukisik Apr 21 '22 at 09:06
  • 4
    From what I understand of the history, valarray was mainly the work of a single indiviual, who then had a change of employment that included a change of focus. Very little work has been done on valarray since. – BoP Apr 21 '22 at 09:13
  • offtopic: It is strange that you did `#include <complex>` but didn't use the `std::complex` template, and that you have those ugly `mb_c_re` and `mb_c_im` names. – Marek R Apr 21 '22 at 09:13
  • @MarekR ok, the complex header removed. mb_c_re means mandelbrot's complex-variable's real-part and the mb_c_im means mandelbrot complex imaginary part. Sometimes variable names get too long for what they represent.) – huseyin tugrul buyukisik Apr 21 '22 at 09:44
  • More like I'm surprised you didn't use `std::complex`. – Marek R Apr 21 '22 at 09:45
  • std::valarray does not accept std::complex to be used like that (I am not the author of this code, but I'd separate the variables too). – huseyin tugrul buyukisik Apr 21 '22 at 09:53
  • 1
    many people had the same question as yours and no one got the answer yet: [Why is valarray so slow?](https://stackoverflow.com/q/6850807/995714), [Why is valarray so slow on Visual Studio 2015?](https://stackoverflow.com/q/56050322/995714), https://www.quora.com/Why-does-nobody-seem-to-use-std-valarray/answer/Daniel-N%C3%A4slund – phuclv Apr 21 '22 at 09:58
  • Those are for specific compilers and about users. I'm asking for the creators of C++ and all compilers. Because users won't start using it unless it is optimized right? – huseyin tugrul buyukisik Apr 21 '22 at 10:00
  • 3
    Maybe have a look at `Eigen::Array`: http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html – chtz Apr 21 '22 at 10:36
  • Yes, as a header-only library, Eigen is awesome. But it still isn't available out of the box with gcc (I don't know whether msvc or clang ship it), right? – huseyin tugrul buyukisik Apr 21 '22 at 11:49
  • 1
    I don’t believe the _standard library_ of the general-purpose language should include things only useful to small minority of users, and can be implemented just fine in third-party libraries. If they start adding HPC stuff from Eigen/BLAS/LAPACK, it’s very unclear where to stop. – Soonts Apr 21 '22 at 14:43

0 Answers0