0

I have an intel i5 (NUC) system running ubuntu, and I am trying to develop a realtime application with point clouds.

I have a for loop of 307200 (640x480) iterations with some conditions. The code snippet is:

void pc_segmentation(pcl::PointCloud<pcl::PointXYZRGB>::Ptr input_cloud,Eigen::Vector4f pt_min,Eigen::Vector4f pt_max)
{
    pcl::PointCloud<pcl::PointXYZRGB> slice_0;
    pcl::PointCloud<pcl::PointXYZRGB> slice_1;
    pcl::PointCloud<pcl::PointXYZRGB> slice_2;
    pcl::PointCloud<pcl::PointXYZRGB> slice_3;
    pcl::PointCloud<pcl::PointXYZRGB> slice_4;
    pcl::PointCloud<pcl::PointXYZRGB> slice_5;
    pcl::PointCloud<pcl::PointXYZRGB> slice_z;
    pcl::PointCloud<pcl::PointXYZRGB> slice_y(input_cloud->width,input_cloud->height);
    pcl::PointCloud<pcl::PointXYZRGB> slice_x(input_cloud->width,input_cloud->height);

    //clearing pointclouds
    slice_0.clear();
    slice_1.clear();
    slice_2.clear();
    slice_3.clear();
    slice_4.clear();
    slice_5.clear();
    slice_z.clear();

    float xmin0=100, xmin1=100, xmin2=100, xmin3=100, xmin4=100, xmin5=100;
    float ymin0=100, ymin1=100, ymin2=100, ymin3=100, ymin4=100, ymin5=100;
    float xmax0=-100, xmax1=-100, xmax2=-100, xmax3=-100, xmax4=-100, xmax5=-100;
    float ymax0=-100, ymax1=-100, ymax2=-100, ymax3=-100, ymax4=-100, ymax5=-100;


    //Slices for Y direction
    Eigen::VectorXf slice_pts_y(NO_OF_SLICES);
    Eigen::VectorXf diff_y(NO_OF_SLICES);
    Eigen::VectorXf slice_pts_x(NO_OF_SLICES_X);
    Eigen::VectorXf diff_x(NO_OF_SLICES_X);

    //Slice Vector Y
    double dist_btw_slice = (pt_max[1] - pt_min[1])/NO_OF_SLICES;
    for(int i=1; i<=NO_OF_SLICES;i++)
        slice_pts_y[i-1] = pt_min[1] + (i*dist_btw_slice);

    //Slice Vector X
    dist_btw_slice = (pt_max[0] - pt_min[0])/NO_OF_SLICES_X;
    for(int i=1; i<=NO_OF_SLICES_X;i++)
        slice_pts_x[i-1] = pt_min[1] + (i*dist_btw_slice);

    ros::Time start_for_loop_time = ros::Time::now();
    for(int ii=0;ii<input_cloud->width;ii++)
        for(int jj=0;jj<input_cloud->height;jj++)
        {
            pcl::PointXYZRGB point = input_cloud->at(ii,jj);
            if((pcl_isfinite(point.z)))
            {
                if((point.z > 0.9)&&(point.z < 1))
                {
                    slice_0.push_back(point);
                }
                else if ((point.z > 1.4)&&(point.z < 1.5))
                {
                    slice_1.push_back(point);
                }
                else if((point.z > 1.9)&&(point.z < 2))
                {
                    slice_2.push_back(point);
                }
                else if ((point.z > 2.4)&&(point.z < 2.5))
                {
                    slice_3.push_back(point);
                }
                else if((point.z > 2.9)&&(point.z < 3))
                {
                    slice_4.push_back(point);
                }
                else if ((point.z > 3.4)&&(point.z < 3.5))
                {
                    slice_5.push_back(point);
                }
            }
            if((pcl_isfinite(point.y)))
            {
                diff_y = slice_pts_y - (point.y * Eigen::VectorXf::Ones(NO_OF_SLICES));
                diff_y=diff_y.array().abs();
                std::sort(diff_y.data(),diff_y.data()+NO_OF_SLICES,std::less<float>());
                if(diff_y[0]<0.005)
                {
                    slice_y.at(ii,jj)=point;
                }
            }
            if((pcl_isfinite(point.x)))
            {
                diff_x = slice_pts_x - (point.x * Eigen::VectorXf::Ones(NO_OF_SLICES_X));
                diff_x=diff_x.array().abs();
                std::sort(diff_x.data(),diff_x.data()+NO_OF_SLICES_X,std::less<float>());
                if(diff_x[0]<0.005)
                {
                    slice_x.at(ii,jj)=point;
                }
            }
        }
    ros::Time end_for_loop_time = ros::Time::now();
    if(seg_pcl_time.is_open())
        seg_pcl_time<<end_for_loop_time.toSec() - start_for_loop_time.toSec()<<std::endl;
    else
        std::cout<<"Segmented Point Cloud for Z slices "<<end_for_loop_time.toSec() - start_for_loop_time.toSec()<<" seconds"<<std::endl;
    ros::Time start_pc_add_time = ros::Time::now();
    slice_z = slice_0 + slice_1;
    slice_z += slice_2;
    slice_z += slice_3;
    slice_z += slice_4;
    slice_z += slice_5;
    pcl::copyPointCloud(slice_z,slices_z_pc); //slices_z_pc is defined globally
    ros::Time end_pc_add_time = ros::Time::now();
    std::cout<<"Adding and Publishing Point Cloud for Z slices "<<end_pc_add_time.toSec() - start_pc_add_time.toSec()<<" seconds"<<std::endl;
    // Publish point clouds
    slices_z_pc.header = input_cloud->header;
    slice_x.header = input_cloud->header;
    slice_y.header = input_cloud->header;
    pub_x_slices.publish(slice_x);
    pub_y_slices.publish(slice_y);
    pub_z_slices.publish(slices_z_pc);
}

NO_OF_SLICES = 20 and NO_OF_SLICES_X = 20

This approximately takes 100ms - 110ms on an i5 system and 340ms on Odroid (octacore ARM processor) running ubuntu. I feel that this processing time is significantly large.

I am not sure how I can optimize it to run faster. I have also enable "O3" optimization while compiling. Any help is greatly appreciated.

PS: I have also tried looping it through the 3D array example point_cloud[640][480][3] but the processing time is slightly lesser (~20ms).

Edit: Changed code from snippet of the for loop to entire function as suggested by @Toby Speight

spacemanspiff
  • 113
  • 12
  • 1
    Have you tried profiler? Removing parts of code (i.e. profiling manually)? Are you use that it's not `sort` which takes all the time? – yeputons Feb 07 '17 at 02:34
  • 1
    It's bound to be slow. You are doing a whole lots of computation and possible memory reallocations... and sorting stuff... twice... per loop And your ifs seem quite random in their order killing the branch predictior. – luk32 Feb 07 '17 at 02:41
  • @yeputons and @luk32 , I use a sort their to find the minimum value in the vector. I tried to do vector computations, in order to make it fast instead of (20 if conditions). Would using `std::min_element` better comparatively? I wanna find the minimum value in the array and check if its below a threshold. Thanks – spacemanspiff Feb 07 '17 at 03:08
  • @luk32: Could you kindly elaborate on branch predictor and how would you put the ifs. Thanks – spacemanspiff Feb 07 '17 at 03:10
  • 1
    I would sort the original vector by `z`, and check if it improves results, as the [most famous question of SO](http://stackoverflow.com/questions/11227809/why-is-it-faster-to-process-a-sorted-array-than-an-unsorted-array) suggests. And split those three loops into three loops. x, z, and y parts are all doing different things. `min` will be faster than `sort` quite obviously. I think Eigen has internal reduction for `min`. There is also a `Constant` type of matrices, so what you are doing is `(slice_pts_x-VectorXf::Constant(NO_OF_SLICES, point.y).cwiseAbs().minCoeff();` – luk32 Feb 07 '17 at 03:31
  • @user3785280 I believe `std::min_element` will do much better than calling `std::sort` and taking the first element, just because it always does fewer operations. – yeputons Feb 07 '17 at 03:59
  • Try cachegrind to monitor your cache misses and optimize accordingly – aerkenemesis Feb 07 '17 at 11:11
  • Your code is incomplete; in particular, it seems to be missing declarations for the `slice_*` variables and at least one `#include`. Your statements should be in a function, too. Please [edit] your code so it's a [mcve] of your problem, then we can try to reproduce and solve it. You should also read [ask]. – Toby Speight Feb 07 '17 at 15:48
  • The amount of if statements there is guaranteeing that you are missing every single branch prediction that your CPU is making. – fwg Feb 14 '17 at 11:54

0 Answers0