This is my C#-based idea of how this issue can be approached; it doesn't solve it completely, but it improves performance. The code below solves the problem of retrieving all commits on the default branch of a repository, but it can be applied to almost any cursor-based pagination scenario in the GitHub GraphQL API. I'm aware that your question concerns "all commits of all branches, deduplicated"; still, I believe this approach may be useful to you as well.
The inherent problem with querying a large repository is the limit of 100 results per page, combined with having to iterate over the pages one by one, since each page contains the cursor to the next. My solution works out the page cursors up front and, by sending all page requests concurrently, reduces the overall execution time.
The idea is to make an initial request to the GitHub GraphQL API that fetches only the total count for the given filters; I assume we fetch 100 results per page. GitHub commit page cursors always have the format "xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 99": the first part is the oid of the first commit of the first page (all cursors on all pages use this same oid; it doesn't change while iterating), and the number after the space is the zero-based index of the last commit of the previous page. This makes it easy to calculate the cursors for each page of, say, a 670-commit repository from the "totalCount" request alone; at 100 results per page that is ceil(670 / 100) = 7 pages, and hence these 7 cursors:
- null
- "xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 99"
- "xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 199"
- "xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 299"
- "xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 399"
- "xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 499"
- "xX9XXXXXXX3961722145Xf39cc9617XXXXxxx 599"
After generating the cursors that identify the beginning of each page, we can prepare a separate Task per page, where each Task contains a request to GitHub GraphQL that fetches one page, and then use Task.WhenAll to execute them all.
I've tested this on a repository with 670 commits, and all 7 pages are fetched in around 7 seconds total. Iterating through the pages one by one instead takes around 4 seconds per page, which adds up to 25-30 seconds.
It should be noted that this wasn't tested in a production environment, that it doesn't cover error handling, and that the parallelism/concurrency implementation can most probably be improved, so it should be viewed only as a proof of concept. Additionally, I'm not sure how the GitHub API will react when you fire off concurrent requests for repositories with 100 or 1,000 pages of commits.
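If that request volume does turn out to be a problem, one option would be to cap how many page requests are in flight at once. Here is a minimal, untested sketch of that idea: GetPagesThrottledAsync is a hypothetical helper of mine, the cap of 10 is an arbitrary guess rather than any documented GitHub limit, and it reuses the GetDefaultBranchCommitsPageByPeriodAsync method shown further down.

private async Task<List<Commit>> GetPagesThrottledAsync(List<string> cursors, DateTime since, string repositoryOwner, string repositoryName)
{
    // SemaphoreSlim gates how many page requests run at the same time
    using var gate = new SemaphoreSlim(10); // arbitrary cap, tune as needed
    var tasks = cursors.Select(async cursor =>
    {
        await gate.WaitAsync();
        try
        {
            // Same per-page request as in GetCommitsByPeriodAsync below
            return await GetDefaultBranchCommitsPageByPeriodAsync(since, cursor, repositoryOwner, repositoryName);
        }
        finally
        {
            gate.Release();
        }
    });
    var results = await Task.WhenAll(tasks);
    return results.SelectMany(x => x.Commits).ToList();
}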
public async Task<List<Commit>> GetCommitsByPeriodAsync(Guid integrationId, DateTime since, string repositoryName, string repositoryOwner)
{
    string initialCursor = null;
    // One request up front to get the total count and the first page's end cursor
    var firstPageInfo = await GetDefaultBranchCommitsFirstPageInfoAsync(since, initialCursor, repositoryOwner, repositoryName);
    // Derive every page cursor from the total count alone, without walking the pages
    var commitPagesCursors = GetCommitPagesCursors(firstPageInfo, initialCursor);
    // One task per page, all fired concurrently
    var tasks = commitPagesCursors.Select(x => GetDefaultBranchCommitsPageByPeriodAsync(since, x, repositoryOwner, repositoryName));
    var results = await Task.WhenAll(tasks);
    var branchCommitsByPeriod = results.SelectMany(x => x.Commits)
        .ToList();
    return branchCommitsByPeriod;
}
private List<string> GetCommitPagesCursors(GetCommitsPageInfoResponse firstPageInfo, string initialCursor)
{
    // No commits in the period: the first (empty) page is the only page
    if (firstPageInfo.PageInfo.EndCursor == null)
    {
        return new List<string> { initialCursor };
    }
    // The first two cursors are always null and "oid 99" for 100-item pages
    var cursors = new List<string> { initialCursor, firstPageInfo.PageInfo.EndCursor };
    int totalCount = firstPageInfo.TotalCount;
    var firstCommitCursorSplit = firstPageInfo.PageInfo.EndCursor.Split(" ");
    var firstCommitId = firstCommitCursorSplit[0];
    var lastPageCommitNumberString = firstCommitCursorSplit[1];
    if (!int.TryParse(lastPageCommitNumberString, out int lastPageCommitNumber))
    {
        throw new FormatException($"Unexpected cursor format: '{firstPageInfo.PageInfo.EndCursor}'");
    }
    // 100 is the max number of objects in a page
    lastPageCommitNumber += 100;
    while (lastPageCommitNumber < totalCount)
    {
        string nextPageCursor = $"{firstCommitId} {lastPageCommitNumber}";
        cursors.Add(nextPageCursor);
        lastPageCommitNumber += 100;
    }
    return cursors;
}
public async Task<GetCommitsPageInfoResponse> GetDefaultBranchCommitsFirstPageInfoAsync(DateTime since, string cursor, string repositoryOwner, string repositoryName)
{
    // Code omitted for brevity

    // Fetches only the total count and the first page's end cursor; no commit data yet
    var commitsRequest = new GraphQLRequest
    {
        Query = @"
            query GetCommitsFirstPage($cursor: String, $commitsSince: GitTimestamp!, $repositoryName: String!, $repositoryOwner: String!) {
              repository(name: $repositoryName, owner: $repositoryOwner) {
                defaultBranchRef {
                  target {
                    ... on Commit {
                      history(after: $cursor, since: $commitsSince) {
                        totalCount
                        pageInfo {
                          endCursor
                          hasNextPage
                        }
                      }
                    }
                  }
                }
              }
            }",
        OperationName = "GetCommitsFirstPage",
        Variables = new
        {
            commitsSince = since.ToString("o"),
            cursor = cursor,
            repositoryOwner = repositoryOwner,
            repositoryName = repositoryName
        }
    };

    // Code omitted for brevity
}
public async Task<GetCommitsPageResponse> GetDefaultBranchCommitsPageByPeriodAsync(DateTime since, string cursor, string repositoryOwner, string repositoryName)
{
    // Code omitted for brevity

    // Fetches a single page of up to 100 commits starting after the given cursor
    var commitsRequest = new GraphQLRequest
    {
        Query = @"
            query GetCommitsSinceTimestamp($cursor: String, $commitsSince: GitTimestamp!, $repositoryName: String!, $repositoryOwner: String!) {
              repository(name: $repositoryName, owner: $repositoryOwner) {
                defaultBranchRef {
                  target {
                    ... on Commit {
                      history(after: $cursor, since: $commitsSince) {
                        pageInfo {
                          endCursor
                          hasNextPage
                        }
                        edges {
                          node {
                            oid
                            additions
                            deletions
                            commitUrl
                            url
                            committedDate
                            associatedPullRequests(first: 10) {
                              nodes {
                                id
                                mergedAt
                              }
                            }
                            repository {
                              databaseId
                              nameWithOwner
                            }
                            author {
                              name
                              email
                              user {
                                login
                              }
                            }
                            message
                          }
                        }
                      }
                    }
                  }
                }
              }
            }",
        OperationName = "GetCommitsSinceTimestamp",
        Variables = new
        {
            commitsSince = since.ToString("o"),
            cursor = cursor,
            repositoryOwner = repositoryOwner,
            repositoryName = repositoryName
        }
    };

    // Code omitted for brevity
}
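For completeness, a hypothetical call site. The commitService instance, the argument values, and the Commit.Oid property are assumptions of mine for illustration (the query above does select oid, but I don't know your exact response model), and DistinctBy requires .NET 6+. The last line hints at how results gathered per branch could be merged for the "all branches, deduplicated" case from your question:

// Hypothetical usage; commitService is an instance of the class containing the methods above
var commits = await commitService.GetCommitsByPeriodAsync(
    integrationId: Guid.NewGuid(), // placeholder value
    since: DateTime.UtcNow.AddDays(-30),
    repositoryName: "my-repo",
    repositoryOwner: "my-org");

// If the same pagination is run once per branch, the combined results could be
// deduplicated by commit oid (assumes the Commit model exposes the oid from the query)
var deduplicated = commits.DistinctBy(c => c.Oid).ToList();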