Capture and encode desktop with libav in real time not giving corect images

Question

As part of a larger project I want to be able to capture and encode the desktop frame by frame in real time. I have the following test code to reproduce the issue shown in the screenshot:

#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <string>
#include <string.h>
#include <math.h>

extern "C"
{
#include "libavdevice/avdevice.h"
#include "libavutil/channel_layout.h"
#include "libavutil/mathematics.h"
#include "libavutil/opt.h"
#include "libavformat/avformat.h"
#include "libswscale/swscale.h"
}


/* 5 seconds stream duration */
#define STREAM_DURATION   5.0
#define STREAM_FRAME_RATE 25 /* 25 images/s */
#define STREAM_NB_FRAMES  ((int)(STREAM_DURATION * STREAM_FRAME_RATE))
#define STREAM_PIX_FMT    AV_PIX_FMT_YUV420P /* default pix_fmt */

int videoStreamIndx;
int framerate = 30;

int width = 1920;
int height = 1080;

int encPacketCounter;

AVFormatContext* ifmtCtx;
AVCodecContext* avcodecContx;
AVFormatContext* ofmtCtx;
AVStream* videoStream;
AVCodecContext* avCntxOut;
AVPacket* avPkt;
AVFrame* avFrame;
AVFrame* outFrame;
SwsContext* swsCtx;

std::ofstream fs;


AVDictionary* ConfigureScreenCapture()
{

    AVDictionary* options = NULL;
    //Try adding "-rtbufsize 100M" as in https://stackoverflow.com/questions/6766333/capture-windows-screen-with-ffmpeg
    av_dict_set(&options, "rtbufsize", "100M", 0);
    av_dict_set(&options, "framerate", std::to_string(framerate).c_str(), 0);
    char buffer[16];
    sprintf(buffer, "%dx%d", width, height);
    av_dict_set(&options, "video_size", buffer, 0);
    return options;
}

AVCodecParameters* ConfigureAvCodec()
{
    AVCodecParameters* av_codec_par_out = avcodec_parameters_alloc();
    av_codec_par_out->width = width;
    av_codec_par_out->height = height;
    av_codec_par_out->bit_rate = 40000;
    av_codec_par_out->codec_id = AV_CODEC_ID_H264; //AV_CODEC_ID_MPEG4; //Try H.264 instead of MPEG4
    av_codec_par_out->codec_type = AVMEDIA_TYPE_VIDEO;
    av_codec_par_out->format = 0;
    return av_codec_par_out;
}

int GetVideoStreamIndex()
{
    int VideoStreamIndx = -1;
    avformat_find_stream_info(ifmtCtx, NULL);
    /* find the first video stream index . Also there is an API available to do the below operations */
    for (int i = 0; i < (int)ifmtCtx->nb_streams; i++) // find video stream position/index.
    {
        if (ifmtCtx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO)
        {
            VideoStreamIndx = i;
            break;
        }
    }

    if (VideoStreamIndx == -1)
    {
    }

    return VideoStreamIndx;
}

void CreateFrames(AVCodecParameters* av_codec_par_in, AVCodecParameters* av_codec_par_out)
{

    avFrame = av_frame_alloc();
    avFrame->width = avcodecContx->width;
    avFrame->height = avcodecContx->height;
    avFrame->format = av_codec_par_in->format;
    av_frame_get_buffer(avFrame, 0);

    outFrame = av_frame_alloc();
    outFrame->width = avCntxOut->width;
    outFrame->height = avCntxOut->height;
    outFrame->format = av_codec_par_out->format;
    av_frame_get_buffer(outFrame, 0);
}

bool Init()
{
    AVCodecParameters* avCodecParOut = ConfigureAvCodec();

    AVDictionary* options = ConfigureScreenCapture();

    AVInputFormat* ifmt = av_find_input_format("gdigrab");
    auto ifmtCtxLocal = avformat_alloc_context();
    if (avformat_open_input(&ifmtCtxLocal, "desktop", ifmt, &options) < 0)
    {
        return false;
    }
    ifmtCtx = ifmtCtxLocal;

    videoStreamIndx = GetVideoStreamIndex();

    AVCodecParameters* avCodecParIn = avcodec_parameters_alloc();
    avCodecParIn = ifmtCtx->streams[videoStreamIndx]->codecpar;

    AVCodec* avCodec = avcodec_find_decoder(avCodecParIn->codec_id);
    if (avCodec == NULL)
    {
        return false;
    }

    avcodecContx = avcodec_alloc_context3(avCodec);
    if (avcodec_parameters_to_context(avcodecContx, avCodecParIn) < 0)
    {
        return false;
    }

    //av_dict_set
    int value = avcodec_open2(avcodecContx, avCodec, NULL); //Initialize the AVCodecContext to use the given AVCodec.
    if (value < 0)
    {
        return false;
    }

    AVOutputFormat* ofmt = av_guess_format("h264", NULL, NULL);

    if (ofmt == NULL)
    {
        return false;
    }

    auto ofmtCtxLocal = avformat_alloc_context();
    avformat_alloc_output_context2(&ofmtCtxLocal, ofmt, NULL, NULL);
    if (ofmtCtxLocal == NULL)
    {
        return false;
    }
    ofmtCtx = ofmtCtxLocal;

    AVCodec* avCodecOut = avcodec_find_encoder(avCodecParOut->codec_id);
    if (avCodecOut == NULL)
    {
        return false;
    }

    videoStream = avformat_new_stream(ofmtCtx, avCodecOut);
    if (videoStream == NULL)
    {
        return false;
    }

    avCntxOut = avcodec_alloc_context3(avCodecOut);
    if (avCntxOut == NULL)
    {
        return false;
    }

    if (avcodec_parameters_copy(videoStream->codecpar, avCodecParOut) < 0)
    {
        return false;
    }

    if (avcodec_parameters_to_context(avCntxOut, avCodecParOut) < 0)
    {
        return false;
    }

    avCntxOut->gop_size = 30; //3; //Use I-Frame frame every 30 frames.
    avCntxOut->max_b_frames = 0;
    avCntxOut->time_base.num = 1;
    avCntxOut->time_base.den = framerate;

    //avio_open(&ofmtCtx->pb, "", AVIO_FLAG_READ_WRITE);

    if (avformat_write_header(ofmtCtx, NULL) < 0)
    {
        return false;
    }

    value = avcodec_open2(avCntxOut, avCodecOut, NULL); //Initialize the AVCodecContext to use the given AVCodec.
    if (value < 0)
    {
        return false;
    }

    if (avcodecContx->codec_id == AV_CODEC_ID_H264)
    {
        av_opt_set(avCntxOut->priv_data, "preset", "ultrafast", 0);
        av_opt_set(avCntxOut->priv_data, "zerolatency", "1", 0);
        av_opt_set(avCntxOut->priv_data, "tune", "ull", 0);
    }

    if ((ofmtCtx->oformat->flags & AVFMT_GLOBALHEADER) != 0)
    {
        avCntxOut->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
    }

    CreateFrames(avCodecParIn, avCodecParOut);

    swsCtx = sws_alloc_context();
    if (sws_init_context(swsCtx, NULL, NULL) < 0)
    {
        return false;
    }

    swsCtx = sws_getContext(avcodecContx->width, avcodecContx->height, avcodecContx->pix_fmt,
        avCntxOut->width, avCntxOut->height, avCntxOut->pix_fmt, SWS_FAST_BILINEAR,
        NULL, NULL, NULL);
    if (swsCtx == NULL)
    {
        return false;
    }

    return true;
}

void Encode(AVCodecContext* enc_ctx, AVFrame* frame, AVPacket* pkt)
{
    int ret;

    /* send the frame to the encoder */
    ret = avcodec_send_frame(enc_ctx, frame);
    if (ret < 0)
    {
        return;
    }

    while (ret >= 0)
    {
        ret = avcodec_receive_packet(enc_ctx, pkt);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF)
            return;
        if (ret < 0)
        {
            return;
        }

        fs.write((char*)pkt->data, pkt->size);
        av_packet_unref(pkt);
    }
}

void EncodeFrames(int noFrames)
{
    int frameCount = 0;
    avPkt = av_packet_alloc();
    AVPacket* outPacket = av_packet_alloc();
    encPacketCounter = 0;

    while (av_read_frame(ifmtCtx, avPkt) >= 0)
    {
        if (frameCount++ == noFrames)
            break;
        if (avPkt->stream_index != videoStreamIndx) continue;

        avcodec_send_packet(avcodecContx, avPkt);

        if (avcodec_receive_frame(avcodecContx, avFrame) >= 0) // Frame successfully decoded :)
        {
            outPacket->data = NULL; // packet data will be allocated by the encoder
            outPacket->size = 0;

            outPacket->pts = av_rescale_q(encPacketCounter, avCntxOut->time_base, videoStream->time_base);
            if (outPacket->dts != AV_NOPTS_VALUE)
                outPacket->dts = av_rescale_q(encPacketCounter, avCntxOut->time_base, videoStream->time_base);

            outPacket->dts = av_rescale_q(encPacketCounter, avCntxOut->time_base, videoStream->time_base);
            outPacket->duration = av_rescale_q(1, avCntxOut->time_base, videoStream->time_base);

            outFrame->pts = av_rescale_q(encPacketCounter, avCntxOut->time_base, videoStream->time_base);
            outFrame->pkt_duration = av_rescale_q(encPacketCounter, avCntxOut->time_base, videoStream->time_base);
            encPacketCounter++;

            int sts = sws_scale(swsCtx,
                avFrame->data, avFrame->linesize, 0, avFrame->height,
                outFrame->data, outFrame->linesize);

            /* make sure the frame data is writable */
            auto ret = av_frame_make_writable(outFrame);
            if (ret < 0)
                break;
            Encode(avCntxOut, outFrame, outPacket);
        }
        av_frame_unref(avFrame);
        av_packet_unref(avPkt);
    }
}

void Dispose()
{
    fs.close();

    auto ifmtCtxLocal = ifmtCtx;
    avformat_close_input(&ifmtCtx);
    avformat_free_context(ifmtCtx);
    avcodec_free_context(&avcodecContx);

}

int main(int argc, char** argv)
{
    avdevice_register_all();

    fs.open("out.h264");

    if (Init())
    {
        EncodeFrames(300);
    }
    else
    {
        std::cout << "Failed to Init \n";
    }    

    Dispose();

    return 0;
}

As far as I can tell the setup of the encoding process is correct as it is largely unchanged from how the example given in the official documentation is working: https://libav.org/documentation/doxygen/master/encode__video_8c_source.html

However there is limited documentation around the desktop capture online so I am not sure if I have set that up correctly.

This is not a solution to the problem directly but a comment on the sample code. I notice that you open a output format context but then don't use it, if you are just dumping the raw packets to file (or passing them to another non-ffmpeg component) then it is wasted effort to open a format context that you don't use, you can instead just open the codec context directly. — staircase27, Sep 02 '22 at 11:09

score 1 · Accepted Answer · answered Sep 01 '22 at 21:24

We have to open the out.h264 as binary file.

Replace fs.open("out.h264"); with fs.open("out.h264", std::ios::binary);.

The default file type in Windows is "text file".
That means that each \n in converted to \r\n when writing, and the encoded stream get "messed up".

It took me quite a long time to figure out the problem...

There is another small issue:
There is a missing loop at the end, that flushes the remaining encoded packets.

We can use FFprobe for counting the number of encoded frames:

ffprobe -v error -select_streams v:0 -count_frames -show_entries stream=nb_read_frames -print_format csv out.h264

The result is 263 instead of 300.

The solution is adding the following loop at the end of void EncodeFrames(int noFrames) function:

int ret = 0;
avcodec_send_frame(avCntxOut, NULL);
do
{
    av_packet_unref(outPacket);
    ret = avcodec_receive_packet(avCntxOut, outPacket);
    if (!ret)
    {
        fs.write((char*)outPacket->data, outPacket->size);
    }
} while (!ret);

Hi Rotem, thanks for the answer, do you have any idea what might be causing the packets to pile up in the first place? For the example given here I write to a file for testing, however I want to be able to send each frame over the web in the final product so I dont want them to pile up and need flushing, but instead for them to send as each frame is ready. — thoxey, Sep 02 '22 at 08:26
That is because the code `av_opt_set(avCntxOut->priv_data, "zerolatency", "1", 0);` is not executed. Use the debugger to see why. Without "tuning" the default latency is relatively long. — Rotem, Sep 02 '22 at 08:31

wolverin · Answer 2 · 2022-09-03T07:43:44.343

0

This is not a solution to the problem directly but maybe so?

AVDictionary * pDic = NULL;
av_dict_set(&pDic, "tune", "zerolatency", 0);
avcodec_open2(avCntxOut, avCodecOut, &pDic);

edited Sep 03 '22 at 07:43

answered Sep 03 '22 at 06:38

wolverin

1
1

Capture and encode desktop with libav in real time not giving corect images

2 Answers2