1

I am running this code (simplified for staying on point):

int main(int argc, char** argv) {

    char* videoPath = args[1];

    DataSource* dataSource;
    std::thread dataSourceThread;
    dataSource = new FileDataSource(0.5f, true);
    dataSourceThread = std::thread(&FileDataSource::start, (FileDataSource*)dataSource, videoPath, 50);


    logger->info("FileDataSource started");
    float invScale = 1.f / dataSource->getScale();

    cv::Mat frame;
    unsigned long bufferIndexPrev = 0;
    unsigned long t;

    cv::Mat projMat(3, 4, CV_32F);
    cv::Mat rVec, tVec;

    while (!dataSource->isRunning()) std::this_thread::yield();
    logger->info("Data source running");

    Tracker tracker;
    tracker.dataSource = dataSource;
    std::thread trackerThread(&Tracker::start, &tracker);

    while (tracker.stateBuffer == nullptr) std::this_thread::yield();
    logger->info("Tracker started");

    while (true) {
        int bufferIndex = tracker.stateBuffer->bufferIndex;
        if (bufferIndex == bufferIndexPrev) {
            if (dataSource->isRunning()) {
                // std::this_thread::sleep_for(std::chrono::milliseconds(1));
                std::this_thread::yield();
                continue;
            }
            else {
                break;
            }
        }
        bufferIndexPrev = bufferIndex;


        State& state = tracker.stateBuffer->buffer[bufferIndex];
        Data data = state.data;

        int lastProcessedFrameNum = tracker.stateBuffer->buffer[bufferIndex].dataPacket.frameNumber;

        dataSource->getOriginalFrameByFrameNum(lastProcessedFrameNum, frame);

        cv::putText(frame, "Processing", cv::Point(20, 30), CV_FONT_HERSHEY_PLAIN, 2, CV_RGB(255, 0, 0)); 
     // ^^^^^^^^^ this line causes the crash
        cv::putText(frame, std::to_string(lastProcessedFrameNum), cv::Point(200, 30), CV_FONT_HERSHEY_PLAIN, 2, CV_RGB(255, 255, 0));

        std::vector<cv::KeyPoint> keypoints = state.keypoints;
        std::vector<int> indices = state.indices;
        for (int i = 0; i < keypoints.size(); i++) {
            cv::Point2f detectedPoint = keypoints[i].pt * invScale;
            if (state.indices[i] >= 0)
                cv::drawMarker(frame, detectedPoint, CV_RGB(0, 255, 0), cv::MARKER_CROSS);
            else 
                cv::drawMarker(frame, detectedPoint, CV_RGB(255, 0, 0), cv::MARKER_CROSS);


        }


        cv::imshow("Video", frame);

        int key = cv::waitKey(10);
        if (key == 32) {
            tracker.nextPhase();
        }

    }

    logger->info("Exited main loop");

    if (trackerThread.joinable()) trackerThread.join();
    if (dataSourceThread.joinable()) dataSourceThread.join();

    return 0;
}

Now, the program sometimes crashes with Segmentation Fault (SIGSEGV) on

cv::putText(frame, "Processing", cv::Point(20, 30), CV_FONT_HERSHEY_PLAIN, 2, CV_RGB(255, 0, 0));

I debugged it using GDB. As far as I can tell, the only thing that can be causing problems is the matrix object frame. I can also confirm this, since the code never crashes if I use another instance of cv::Mat in place of frame.

The data source is running its own loop of fetching video frames and putting them inside a circular buffer. The main program takes a video frame from that buffer when it needs one.

/**
 * Producer loop: reads frames from the video file into the circular buffers
 * at a fixed rate until the video reports no more frames.
 *
 * For each frame the next slot receives the original frame, a grayscale
 * version (down-scaled by `scale` unless `presized` is set), a frame number
 * one higher than the previous slot's, and a timestamp advanced by one
 * period. `bufferIndex` is moved to the slot only after it has been filled.
 *
 * @param fileVideoPath  path of the video file to open
 * @param fps            playback rate in frames per second; must be positive
 *
 * NOTE(review): `bufferIndex` and `running` are written here and read from
 * other threads (e.g. via isRunning()); no synchronization is visible in this
 * block -- confirm that the members are atomic or otherwise protected.
 */
void FileDataSource::start(char* fileVideoPath, int fps) {

    // A non-positive rate would divide by zero or give a negative period,
    // and converting that to an unsigned value is undefined.
    if (fps <= 0) {
        logger->info("FileDataSource invalid fps {}", fps);
        this->running = false;
        return;
    }

    logger->info("FileDataSource opening FileVideo {}", fileVideoPath);
    FileVideo video(fileVideoPath);

    const unsigned long period = 1000000UL / static_cast<unsigned long>(fps); // microseconds
    const std::chrono::microseconds periodDuration(period);

    // steady_clock is monotonic, so wall-clock adjustments cannot stretch or
    // shorten the pause between frames.
    std::chrono::steady_clock::time_point iterationStart = std::chrono::steady_clock::now();
    while (true) {
        const int nextBufferIndex = (bufferIndex + 1) % bufferSize;

        // Decode straight into the slot that is about to become current.
        cv::Mat& originalFrame = originalFrameBuffer[nextBufferIndex];
        const bool success = video.getFrame(originalFrame);
        if (!success) break;

        cv::Mat& reducedFrame = reducedFrameBuffer[nextBufferIndex];
        cvtColor(originalFrame, reducedFrame, CV_BGR2GRAY);
        if (!this->presized) resize(reducedFrame, reducedFrame, cv::Size(), scale, scale);

        frameNumBuffer[nextBufferIndex] = frameNumBuffer[bufferIndex] + 1;
        tBuffer[nextBufferIndex] = tBuffer[bufferIndex] + period;

        // Publish the slot only after all of its buffers have been filled.
        bufferIndex = nextBufferIndex;
        this->running = true;

        // Sleep out the rest of this frame's period, then start timing the next.
        std::this_thread::sleep_until(iterationStart + periodDuration);
        iterationStart = std::chrono::steady_clock::now();
    }

    this->running = false;
}

/**
 * Looks up the original (full-size) frame with the given frame number.
 *
 * The search starts at the current buffer index and walks backwards through
 * the circular buffer, visiting each slot at most once. If no slot carries
 * `frameNum`, the walk ends where it began, so the frame at the current
 * buffer index is returned instead.
 *
 * @param frameNum  frame number to look for
 * @param frame     receives the frame; cv::Mat assignment copies only the
 *                  header, so `frame` shares pixel memory with the buffer
 *                  slot -- clone it before modifying it
 */
void FileDataSource::getOriginalFrameByFrameNum(unsigned long frameNum, cv::Mat& frame) {

    int slot = this->bufferIndex;

    // Step back one slot at a time (with wrap-around) until the frame number
    // matches or every slot has been checked.
    for (int checked = 0; checked < bufferSize && !(frameNumBuffer[slot] == frameNum); ++checked)
    {
        slot = (slot - 1 + bufferSize) % bufferSize;
    }

    frame = originalFrameBuffer[slot];
}

The tracker neither reads from nor writes to the originalFrameBuffer; it only reads from reducedFrameBuffer. That code is not shown here, since the post is long enough as it is.

From another SO question I learned about the nature of segmentation faults:

Segmentation fault is a specific kind of error caused by accessing memory that “does not belong to you.” It’s a helper mechanism that keeps you from corrupting the memory and introducing hard-to-debug memory bugs. Whenever you get a segfault you know you are doing something wrong with memory – accessing variable that has already been freed, writing to a read-only portion of the memory, etc.

I really don't see how this applies to my case. Any ideas?

The GDB backtrace shows:

Thread 1 "videoGTKExample" received signal SIGSEGV, Segmentation fault.
0x00007fffed095281 in malloc_consolidate () from /lib/libc.so.6
(gdb) bt
#0  0x00007fffed095281 in malloc_consolidate () at /lib/libc.so.6
#1  0x00007fffed096d2a in _int_malloc () at /lib/libc.so.6
#2  0x00007fffed098d44 in malloc () at /lib/libc.so.6
#3  0x00007fffed966a78 in operator new(unsigned long) (sz=8192) at /build/gcc-multilib/src/gcc/libstdc++-v3/libsupc++/new_op.cc:50
#4  0x00007fffeed3d684 in cv::putText(cv::_InputOutputArray const&, cv::String const&, cv::Point_<int>, int, double, cv::Scalar_<double>, int, int, bool) ()
    at /usr/local/lib/libopencv_imgproc.so.3.1
#5  0x00000000004a920b in main(int, char**) (argc=4, argv=0x7fffffffe0d8) at /home/andro/tracker/examples/videoGTKExample.cpp:180

P.S. If I run the code with valgrind, I cannot reproduce the error since it is executing much slower and the concurrency of threads no longer seems to be the problem.

UPDATE:

I've managed to reproduce the SIGSEGV crash while running via:

valgrind --tool=exp-sgcheck

The resulting log is here. I am not sure, though, that this is the same error, since it occurred at a line where the crash does not normally occur. I am new to valgrind and do not know if there is anything useful in this log. All I see is a stack trace, similar to what I see during a normal gdb session.

UPDATE 2:

In gdb I also sometimes (rarely) get the following crash:

Thread 1 "videoGTKExample" received signal SIGABRT, Aborted.
0x00007fffed05104f in raise () from /lib/libc.so.6
(gdb) bt
#0  0x00007fffed05104f in raise () at /lib/libc.so.6
#1  0x00007fffed05247a in abort () at /lib/libc.so.6
#2  0x00007fffed08ec50 in __libc_message () at /lib/libc.so.6
#3  0x00007fffed094fe6 in malloc_printerr () at /lib/libc.so.6
#4  0x00007fffed09536c in malloc_consolidate () at /lib/libc.so.6
#5  0x00007fffed096d2a in _int_malloc () at /lib/libc.so.6
#6  0x00007fffed098d44 in malloc () at /lib/libc.so.6
#7  0x00007fffed966a78 in operator new(unsigned long) (sz=8192) at /build/gcc-multilib/src/gcc/libstdc++-v3/libsupc++/new_op.cc:50
#8  0x00007fffeed3d684 in cv::putText(cv::_InputOutputArray const&, cv::String const&, cv::Point_<int>, int, double, cv::Scalar_<double>, int, int, bool) ()
    at /usr/local/lib/libopencv_imgproc.so.3.1
#9  0x00000000004a951b in main(int, char**) (argc=4, argv=0x7fffffffe0d8) at /home/andro/stypevisualirtracker/examples/videoGTKExample.cpp:180

From this question:

abort() sends the calling process the SIGABRT signal, this is how abort() basically works.

abort() is usually called by library functions which detect an internal error or some seriously broken constraint. For example malloc() will call abort() if its internal structures are damaged by a heap overflow.

So this is most certainly caused by a corrupted heap.

Community
  • 1
  • 1
AndroC
  • 4,758
  • 2
  • 46
  • 69
  • 4
    Improper use of threads (poor synchronisation causing race conditions etc) may be the cause here. – Jabberwocky Feb 03 '17 at 13:49
  • I would understand that this can occur if I want to draw on `frame` while datasource is writing into the same memory block. Then I assume strange things can happen. But the circular buffer ensures that the datasource is always writing in `cv::Mat`s that nobody has been using for some time... – AndroC Feb 03 '17 at 14:00
  • 1
    Possible duplicate of [Segfaults in malloc() and malloc\_consolidate()](http://stackoverflow.com/questions/3100193/segfaults-in-malloc-and-malloc-consolidate) – rustyx Feb 03 '17 at 14:01
  • 1
    If possible try to run the program with only one thread and see if the problem goes away. – Jabberwocky Feb 03 '17 at 14:03
  • @RustyX I have seen this question. I tried setting MALLOC_CHECK_ to all suggested values, nothing changes – AndroC Feb 03 '17 at 14:05
  • 2
    Malloc bugs are so much fun to debug. That's why I don't use Java or C#, they take away the fun. One line of code can corrupt the heap, but a different line of code will crash when you allocate the right size chunk that sends malloc to access the corrupted part of the heap. There are lots of tricks to find these bugs. – brian beuning Feb 03 '17 at 14:10
  • @MichaelWalz I'm pretty sure its due to multithreading because of its indeterminism. It crashes ~1 in 3 times and always at slightly different video frame. Will try your suggestion later, though... – AndroC Feb 03 '17 at 14:16
  • Try to run it single thread calling the async code instead of the while(running) loops. I'm quite sure the problem is in the thread synchronization, probably in the "isRunning" method (you should use at least an std::atomic variable for it). – gabry Feb 03 '17 at 14:23
  • 1
    Isn't this an older version of OpenCV? You could try running the same using OpenCV 3.x – Jeru Luke Feb 03 '17 at 14:51
  • I think the `resize` of `reducedFrame` is the culprit. Destination `Mat` is reused only when the [shape and type match](http://docs.opencv.org/2.4.10/modules/core/doc/basic_structures.html#mat-create). Since you change size, the Mat gets reallocated. Next time you do `cvtColor` on it, it gets reallocated again. Lack of synchronization and sharing of the same `Mat` object in both threads causes you to eventually touch deallocated memory. – Dan Mašek Feb 03 '17 at 14:52
  • @JeruLuke this is OpenCV3.1 – AndroC Feb 03 '17 at 14:52
  • @DanMašek, sorry I didn't clean up the code very thoroughly... the `resize` doesn't actually get called. (`presized == true`) – AndroC Feb 03 '17 at 14:55

0 Answers0