I have an intel i5 (NUC) system running ubuntu, and I am trying to develop a realtime application with point clouds.
I have a for loop of 307200 (640x480) iterations with some conditions. The code snippet is:
void pc_segmentation(pcl::PointCloud<pcl::PointXYZRGB>::Ptr input_cloud,Eigen::Vector4f pt_min,Eigen::Vector4f pt_max)
{
pcl::PointCloud<pcl::PointXYZRGB> slice_0;
pcl::PointCloud<pcl::PointXYZRGB> slice_1;
pcl::PointCloud<pcl::PointXYZRGB> slice_2;
pcl::PointCloud<pcl::PointXYZRGB> slice_3;
pcl::PointCloud<pcl::PointXYZRGB> slice_4;
pcl::PointCloud<pcl::PointXYZRGB> slice_5;
pcl::PointCloud<pcl::PointXYZRGB> slice_z;
pcl::PointCloud<pcl::PointXYZRGB> slice_y(input_cloud->width,input_cloud->height);
pcl::PointCloud<pcl::PointXYZRGB> slice_x(input_cloud->width,input_cloud->height);
//clearing pointclouds
slice_0.clear();
slice_1.clear();
slice_2.clear();
slice_3.clear();
slice_4.clear();
slice_5.clear();
slice_z.clear();
float xmin0=100, xmin1=100, xmin2=100, xmin3=100, xmin4=100, xmin5=100;
float ymin0=100, ymin1=100, ymin2=100, ymin3=100, ymin4=100, ymin5=100;
float xmax0=-100, xmax1=-100, xmax2=-100, xmax3=-100, xmax4=-100, xmax5=-100;
float ymax0=-100, ymax1=-100, ymax2=-100, ymax3=-100, ymax4=-100, ymax5=-100;
//Slices for Y direction
Eigen::VectorXf slice_pts_y(NO_OF_SLICES);
Eigen::VectorXf diff_y(NO_OF_SLICES);
Eigen::VectorXf slice_pts_x(NO_OF_SLICES_X);
Eigen::VectorXf diff_x(NO_OF_SLICES_X);
//Slice Vector Y
double dist_btw_slice = (pt_max[1] - pt_min[1])/NO_OF_SLICES;
for(int i=1; i<=NO_OF_SLICES;i++)
slice_pts_y[i-1] = pt_min[1] + (i*dist_btw_slice);
//Slice Vector X
dist_btw_slice = (pt_max[0] - pt_min[0])/NO_OF_SLICES_X;
for(int i=1; i<=NO_OF_SLICES_X;i++)
slice_pts_x[i-1] = pt_min[1] + (i*dist_btw_slice);
ros::Time start_for_loop_time = ros::Time::now();
for(int ii=0;ii<input_cloud->width;ii++)
for(int jj=0;jj<input_cloud->height;jj++)
{
pcl::PointXYZRGB point = input_cloud->at(ii,jj);
if((pcl_isfinite(point.z)))
{
if((point.z > 0.9)&&(point.z < 1))
{
slice_0.push_back(point);
}
else if ((point.z > 1.4)&&(point.z < 1.5))
{
slice_1.push_back(point);
}
else if((point.z > 1.9)&&(point.z < 2))
{
slice_2.push_back(point);
}
else if ((point.z > 2.4)&&(point.z < 2.5))
{
slice_3.push_back(point);
}
else if((point.z > 2.9)&&(point.z < 3))
{
slice_4.push_back(point);
}
else if ((point.z > 3.4)&&(point.z < 3.5))
{
slice_5.push_back(point);
}
}
if((pcl_isfinite(point.y)))
{
diff_y = slice_pts_y - (point.y * Eigen::VectorXf::Ones(NO_OF_SLICES));
diff_y=diff_y.array().abs();
std::sort(diff_y.data(),diff_y.data()+NO_OF_SLICES,std::less<float>());
if(diff_y[0]<0.005)
{
slice_y.at(ii,jj)=point;
}
}
if((pcl_isfinite(point.x)))
{
diff_x = slice_pts_x - (point.x * Eigen::VectorXf::Ones(NO_OF_SLICES_X));
diff_x=diff_x.array().abs();
std::sort(diff_x.data(),diff_x.data()+NO_OF_SLICES_X,std::less<float>());
if(diff_x[0]<0.005)
{
slice_x.at(ii,jj)=point;
}
}
}
ros::Time end_for_loop_time = ros::Time::now();
if(seg_pcl_time.is_open())
seg_pcl_time<<end_for_loop_time.toSec() - start_for_loop_time.toSec()<<std::endl;
else
std::cout<<"Segmented Point Cloud for Z slices "<<end_for_loop_time.toSec() - start_for_loop_time.toSec()<<" seconds"<<std::endl;
ros::Time start_pc_add_time = ros::Time::now();
slice_z = slice_0 + slice_1;
slice_z += slice_2;
slice_z += slice_3;
slice_z += slice_4;
slice_z += slice_5;
pcl::copyPointCloud(slice_z,slices_z_pc); //slices_z_pc is defined globally
ros::Time end_pc_add_time = ros::Time::now();
std::cout<<"Adding and Publishing Point Cloud for Z slices "<<end_pc_add_time.toSec() - start_pc_add_time.toSec()<<" seconds"<<std::endl;
// Publish point clouds
slices_z_pc.header = input_cloud->header;
slice_x.header = input_cloud->header;
slice_y.header = input_cloud->header;
pub_x_slices.publish(slice_x);
pub_y_slices.publish(slice_y);
pub_z_slices.publish(slices_z_pc);
}
NO_OF_SLICES = 20 and NO_OF_SLICES_X = 20
This approximately takes 100ms - 110ms on an i5 system and 340ms on Odroid (octacore ARM processor) running ubuntu. I feel that this processing time is significantly large.
I am not sure how I can optimize it to run faster. I have also enable "O3"
optimization while compiling. Any help is greatly appreciated.
PS: I have also tried looping it through the 3D array example point_cloud[640][480][3]
but the processing time is slightly lesser (~20ms).
Edit: Changed code from snippet of the for loop to entire function as suggested by @Toby Speight