Finally got it.
The idea is to draw each marker as a white box on a black image. Then crop the region that we want and warp it into a new image. Since the correct size for the new image is unknown, we just make it square. The new image should be a black image with white boxes at the corners. Starting from (0,0), we then walk diagonally across the image and check each pixel value. The pixel value should be white. If the pixel value is black, we are outside the white box. Trace back the pixel values along x and y, because the white box might be tall or wide. Once we find the bottom-right corner of the white box, we have the size of the white box. Rescale this white box to a square. Use the same function to rescale the image.
This is the image captured by camera

Draw the markers as white boxes on a black image.

Crop it and warp it into a square.

Get the width and height of the white box in top left corner.
Once we have the scale function, apply it.

In case anyone is interested, here is the code.
// Get3dRectFrom2d.cpp : This file contains the 'main' function. Program execution begins and ends there.
//
#include "pch.h"
#include <iostream>
#include <opencv2/opencv.hpp>
#include <opencv2/aruco.hpp>
#define CAMERA_WINDOW "Simple ArUco"
using namespace std;
using namespace cv;
// Loads camera calibration data from the file `filename`.
//
// On success the "camera_matrix" node is read into `camMatrix` and the
// "distortion_coefficients" node into `distCoeffs`.
// Returns false (leaving both outputs untouched) when the file cannot be
// opened, true otherwise.
static bool readCameraParameters(string filename, Mat &camMatrix, Mat &distCoeffs) {
    FileStorage storage(filename, FileStorage::READ);
    const bool opened = storage.isOpened();
    if (opened) {
        storage["camera_matrix"] >> camMatrix;
        storage["distortion_coefficients"] >> distCoeffs;
    }
    return opened;
}
int main()
{
Mat camMatrix, distCoeffs;
string cameraSettings = "camera.txt";
bool estimatePose = false;
bool showRejected = true;
if (readCameraParameters(cameraSettings, camMatrix, distCoeffs))
{
estimatePose = true;
}
Ptr<aruco::Dictionary> dictionary =
aruco::getPredefinedDictionary(aruco::PREDEFINED_DICTIONARY_NAME(aruco::DICT_4X4_50));
Ptr<aruco::DetectorParameters> detectorParams = aruco::DetectorParameters::create();
float markerLength = 3.75f;
float markerSeparation = 0.5f;
double totalTime = 0;
int totalIterations = 0;
VideoCapture inputVideo(0);
if (!inputVideo.isOpened())
{
cout << "cannot open camera";
}
double prevW = -1, prevH = -1;
double increment = 0.1;
while (inputVideo.grab())
{
Mat image, imageCopy;
inputVideo.retrieve(image);
double tick = (double)getTickCount();
vector< int > ids;
vector< vector< Point2f > > corners, rejected;
vector< Vec3d > rvecs, tvecs;
// detect markers and estimate pose
aruco::detectMarkers(image, dictionary, corners, ids, detectorParams, rejected);
if (estimatePose && ids.size() > 0)
aruco::estimatePoseSingleMarkers(corners, markerLength, camMatrix, distCoeffs, rvecs,
tvecs);
double currentTime = ((double)getTickCount() - tick) / getTickFrequency();
totalTime += currentTime;
totalIterations++;
if (totalIterations % 30 == 0) {
cout << "Detection Time = " << currentTime * 1000 << " ms "
<< "(Mean = " << 1000 * totalTime / double(totalIterations) << " ms)" << endl;
}
// draw results
image.copyTo(imageCopy);
if (ids.size() > 0) {
aruco::drawDetectedMarkers(imageCopy, corners, ids);
if (estimatePose) {
for (unsigned int i = 0; i < ids.size(); i++)
aruco::drawAxis(imageCopy, camMatrix, distCoeffs, rvecs[i], tvecs[i],
markerLength * 0.5f);
}
}
if (ids.size() == 4)
{
if (true)
{
// process the image
array<Point2f, 4> srcCorners; // corner that we want
array<Point2f, 4> dstCorners; // destination corner
vector<Point> marker0; // marker corner
vector<Point> marker1; // marker corner
vector<Point> marker2; // marker corner
vector<Point> marker3; // marker corner
//id 8 14 18 47
for (size_t i = 0; i < ids.size(); i++)
{
// first corner
if (ids[i] == 8)
{
srcCorners[0] = corners[i][0]; // get the first point
//srcCornersSmall[0] = corners[i][2];
marker0.push_back(corners[i][0]);
marker0.push_back(corners[i][1]);
marker0.push_back(corners[i][2]);
marker0.push_back(corners[i][3]);
}
// second corner
else if (ids[i] == 14)
{
srcCorners[1] = corners[i][1]; // get the second point
//srcCornersSmall[1] = corners[i][3];
marker1.push_back(corners[i][0]);
marker1.push_back(corners[i][1]);
marker1.push_back(corners[i][2]);
marker1.push_back(corners[i][3]);
}
// third corner
else if (ids[i] == 18)
{
srcCorners[2] = corners[i][2]; // get the thirt point
//srcCornersSmall[2] = corners[i][0];
marker2.push_back(corners[i][0]);
marker2.push_back(corners[i][1]);
marker2.push_back(corners[i][2]);
marker2.push_back(corners[i][3]);
}
// fourth corner
else if (ids[i] == 47)
{
srcCorners[3] = corners[i][3]; // get the fourth point
//srcCornersSmall[3] = corners[i][1];
marker3.push_back(corners[i][0]);
marker3.push_back(corners[i][1]);
marker3.push_back(corners[i][2]);
marker3.push_back(corners[i][3]);
}
}
// create a black image with the same size of cam image
Mat mask = Mat::zeros(imageCopy.size(), CV_8UC1);
Mat dstImage = Mat::zeros(imageCopy.size(), CV_8UC1);
// draw white fill on marker corners
{
int num = (int)marker0.size();
if (num != 0)
{
const Point * pt4 = &(marker0[0]);
fillPoly(mask, &pt4, &num, 1, Scalar(255, 255, 255), 8);
}
}
{
int num = (int)marker1.size();
if (num != 0)
{
const Point * pt4 = &(marker1[0]);
fillPoly(mask, &pt4, &num, 1, Scalar(255, 255, 255), 8);
}
}
{
int num = (int)marker2.size();
if (num != 0)
{
const Point * pt4 = &(marker2[0]);
fillPoly(mask, &pt4, &num, 1, Scalar(255, 255, 255), 8);
}
}
{
int num = (int)marker3.size();
if (num != 0)
{
const Point * pt4 = &(marker3[0]);
fillPoly(mask, &pt4, &num, 1, Scalar(255, 255, 255), 8);
}
}
// draw the mask
imshow("black white lines", mask);
// we dont have the correct size/aspect ratio
double width = 256.0f, height = 256.0f;
dstCorners[0] = Point2f(0.0f, 0.0f);
dstCorners[1] = Point2f(width, 0.0f);
dstCorners[2] = Point2f(width, height);
dstCorners[3] = Point2f(0.0f, height);
// get perspectivetransform
Mat M = getPerspectiveTransform(srcCorners, dstCorners);
// warp perspective
Mat dst;
Size dsize = Size(cvRound(dstCorners[2].x), cvRound(dstCorners[2].y));
warpPerspective(mask, dst, M, dsize);
// show warped image
imshow("perspective transformed", dst);
// get width and length of the first marker
// start from (0,0) and cross
int cx = 0, cy = 0; // track our current coordinate
Scalar v, vx, vy; // pixel value at coordinate
bool cont = true;
while (cont)
{
v = dst.at<uchar>(cx, cy); // get pixel value at current coordinate
if (cx > 1 && cy > 1)
{
vx = dst.at<uchar>(cx - 1, cy);
vy = dst.at<uchar>(cx, cy - 1);
}
// if pixel not black, continue crossing
if ((int)v.val[0] != 0)
{
cx++;
cy++;
}
// current pixel is black
// if previous y pixel is not black, means that we need to walk the pixel right
else if ((int)((Scalar)dst.at<uchar>(cx, cy - 1)).val[0] != 0)
{
cx = cx + 1;
}
// if previous x pixel is not black, means that we need to walk the pixel down
else if ((int)((Scalar)dst.at<uchar>(cx - 1, cy)).val[0] != 0)
{
cy = cy + 1;
}
// the rest is the same with previous 2, only with higher previous pixel to check
// need to do this because sometimes pixels is jagged
else if ((int)((Scalar)dst.at<uchar>(cx, cy - 2)).val[0] != 0)
{
cx = cx + 1;
}
else if ((int)((Scalar)dst.at<uchar>(cx - 2, cy)).val[0] != 0)
{
cy = cy + 1;
}
else if ((int)((Scalar)dst.at<uchar>(cx, cy - 3)).val[0] != 0)
{
cx = cx + 1;
}
else if ((int)((Scalar)dst.at<uchar>(cx - 3, cy)).val[0] != 0)
{
cy = cy + 1;
}
else if ((int)((Scalar)dst.at<uchar>(cx, cy - 4)).val[0] != 0)
{
cx = cx + 1;
}
else if ((int)((Scalar)dst.at<uchar>(cx - 4, cy)).val[0] != 0)
{
cy = cy + 1;
}
else if ((int)((Scalar)dst.at<uchar>(cx, cy - 5)).val[0] != 0)
{
cx = cx + 1;
}
else if ((int)((Scalar)dst.at<uchar>(cx - 5, cy)).val[0] != 0)
{
cy = cy + 1;
}
else
{
cx = cx - 1;
cy = cy - 1;
cont = false;
}
// reached the end of the picture
if (cx >= dst.cols)
{
cont = false;
}
else if (cy >= dst.rows)
{
cont = false;
}
}
if (cx == cy)
{
//we have perfect square
}
if (cx > cy)
{
// wide
width = (height * ((double)cx / (double)cy));
}
else
{
// tall
height = (width * ((double)cy / (double)cx));
}
// we dont want the size varied too much every frame,
// so limits the increment or decrement for every frame
// initialize first usage
if (prevW<0)
{
prevW = width;
}
if (prevH<0)
{
prevH = height;
}
if (width > prevW + increment)
{
width = prevW + increment;
}
else if (width < prevW - increment)
{
width = prevW - increment;
}
prevW = width;
if (height > prevH + increment)
{
height = prevH + increment;
}
else if (height < prevH - increment)
{
height = prevH - increment;
}
prevH = height;
// show resized image
Size s(width, height);
Mat resized;
resize(dst, resized, s);
imshow("resized", resized);
}
}
if (showRejected && rejected.size() > 0)
aruco::drawDetectedMarkers(imageCopy, rejected, noArray(), Scalar(100, 0, 255));
imshow("out", imageCopy);
if (waitKey(1) == 27) {
break;
}
}
cout << "Hello World!\n";
cin.ignore();
return 0;
}
I'm more interested in a mathematical solution, but for now this suffices. If you know a better (faster) approach, let me know.