Using GraphQL API v4
You can use GraphQL API v4 to optimize commits download per branch. In the following method, I've managed to download in a single request 1900 commits (100 commits per branch in 19 different branches) which drastically reduces the number of requests (compared to using REST api).
1 - Get all branches
You will have to get all branches & go through pagination if you have more than 100 branches :
Query :
query($owner:String!, $name:String!, $branchCursor: String!) {
repository(owner: $owner, name: $name) {
refs(first: 100, refPrefix: "refs/heads/",after: $branchCursor) {
totalCount
edges {
node {
name
target {
...on Commit {
history(first:0){
totalCount
}
}
}
}
}
pageInfo {
endCursor
hasNextPage
}
}
}
}
variables :
{
"owner": "google",
"name": "gson",
"branchCursor": ""
}
Try it in the explorer
Note that branchCursor
variable is used when you have more than 100 branches & takes the value of pageInfo.endCursor
in the previous request in that case.
2 - Split the branches array into array of 19 branches max
There is a limitation on the number of nodes per request that prevents us from requesting too many commits in a single query. Here, some testing I've performed showed that we can't go over 19*100 commits in a single query.
Note that for repos which have fewer than 19 branches, you don't need to bother with this step
3 - Query commits by chunk of 100 for each branch
You can then create your query dynamically for getting the 100 next commits on all branches. An example with 2 branches :
query ($owner: String!, $name: String!) {
repository(owner: $owner, name: $name) {
branch0: ref(qualifiedName: "JsonArrayImplementsList") {
target {
... on Commit {
history(first: 100) {
...CommitFragment
}
}
}
}
branch1: ref(qualifiedName: "master") {
target {
... on Commit {
history(first: 100) {
...CommitFragment
}
}
}
}
}
}
fragment CommitFragment on CommitHistoryConnection {
totalCount
nodes {
oid
message
committedDate
author {
name
email
}
}
pageInfo {
hasNextPage
endCursor
}
}
Try it in the explorer
- The variables used are
owner
for the repo's owner & name
for the name of the repo.
- A fragment in order to avoid duplication of commit history field definition.
You can see that pageInfo.hasNextPage
& pageInfo.endCursor
will be used to go through pagination for each branch. The pagination takes place in history(first: 100)
with specification of the last cursor encountered. For instance the next request will have history(first: 100, after: "6e2fcdcaf252c54a151ce6a4441280e4c54153ae 99")
. For each branch, we have to update the request with the last endCursor
value to query for the next 100 commits.
When pageInfo.hasNextPage
is false
, there is no more page for this branch, so we won't include it in the next request.
When the last branch has pageInfo.hasNextPage
to false
, we have retrieved all commits.
Sample implementation
Here is a sample implementation in NodeJS using github-graphql-client. The same method could be implemented in any other language. The following will also store commits in a file commitsX.json
:
// Third-party, callback-based GraphQL client for the GitHub v4 API.
const client = require('github-graphql-client');
const fs = require("fs");

// Target repository and credentials — edit these before running.
const owner = "google";
const repo = "gson";
const accessToken = "YOUR_ACCESS_TOKEN";

// Lists branches 100 at a time together with each branch's total commit
// count. `history(first: 0)` fetches no commit nodes, only `totalCount`.
// `$branchCursor` carries `pageInfo.endCursor` from the previous page
// ("" on the first request).
const branchQuery = `
query($owner:String!, $name:String!, $branchCursor: String!) {
repository(owner: $owner, name: $name) {
refs(first: 100, refPrefix: "refs/heads/",after: $branchCursor) {
totalCount
edges {
node {
name
target {
...on Commit {
history(first:0){
totalCount
}
}
}
}
}
pageInfo {
endCursor
hasNextPage
}
}
}
}`;
/**
 * Dynamically builds a GraphQL query that fetches the next page of 100
 * commits for every branch that still has pages left.
 *
 * @param {Object} branches - map of alias ("branch0", ...) to an entry
 *   holding {name, cursor, hasNextPage, ...}; entries whose hasNextPage
 *   is false are skipped so finished branches drop out of the request.
 * @returns {string} the complete query text, CommitFragment included.
 */
function buildCommitQuery(branches){
    const parts = [`
query ($owner: String!, $name: String!) {
repository(owner: $owner, name: $name) {`];
    Object.keys(branches)
        .filter((alias) => branches[alias].hasNextPage)
        .forEach((alias) => {
            const entry = branches[alias];
            // `after: null` (no cursor yet) asks for the first page;
            // otherwise resume from the stored end cursor.
            parts.push(`
${alias}: ref(qualifiedName: "${entry.name}") {
target {
... on Commit {
history(first: 100, after: ${entry.cursor ? '"' + entry.cursor + '"': null}) {
...CommitFragment
}
}
}
}`);
        });
    parts.push(`
}
}`);
    parts.push(commitFragment);
    return parts.join('');
}
// Shared selection set for a page of commit history: the commit data we
// keep plus the pagination info driving the next request.
const commitFragment = `
fragment CommitFragment on CommitHistoryConnection {
totalCount
nodes {
oid
message
committedDate
author {
name
email
}
}
pageInfo {
hasNextPage
endCursor
}
}`;
// Promisified wrapper around the callback-based GraphQL client:
// resolves with the raw response, or logs the error (pretty-printed)
// and rejects with it.
function doRequest(query, variables) {
    return new Promise((resolve, reject) => {
        const request = {
            token: accessToken,
            query: query,
            variables: variables
        };
        client(request, (err, res) => {
            if (err) {
                console.log(JSON.stringify(err, null, 2));
                reject(err);
            } else {
                resolve(res);
            }
        });
    });
}
// Converts the branch edges returned by branchQuery into a map keyed
// "branch0", "branch1", ... — the aliases later used in the commit
// query. Each entry tracks its pagination state (cursor/hasNextPage)
// and accumulates the commits fetched so far.
function buildBranchObject(branch){
    const refs = {};
    branch.forEach((edge, i) => {
        const node = edge.node;
        console.log(`branch ${node.name}`);
        refs[`branch${i}`] = {
            name: node.name,
            totalCount: node.target.history.totalCount,
            cursor: null,
            hasNextPage : true,
            commits: []
        };
    });
    return refs;
}
/**
 * Fetches every commit of every branch of the repository and writes the
 * result as JSON files (one file per chunk of up to 19 branches).
 *
 * Steps:
 *  1. Page through all branches, 100 per request.
 *  2. Split them into chunks of 19 (the per-query node-count limit
 *     found by testing — see the article above).
 *  3. For each chunk, repeatedly query the next 100 commits of every
 *     branch that still has pages, until no branch has a next page.
 *
 * Fixes vs. the original: removed the stray `isEmpty = false;` (an
 * implicit global that was never read), and "done" is now only logged
 * when the file write actually succeeded.
 */
async function requestGraphql() {
    let iterateBranch = true;
    let branches = [];
    let cursor = "";
    // get all branches
    while (iterateBranch) {
        const res = await doRequest(branchQuery, {
            "owner": owner,
            "name": repo,
            "branchCursor": cursor
        });
        iterateBranch = res.data.repository.refs.pageInfo.hasNextPage;
        cursor = res.data.repository.refs.pageInfo.endCursor;
        branches = branches.concat(res.data.repository.refs.edges);
    }
    // split the branch array into smaller arrays of 19 items
    const refChunk = [];
    const size = 19;
    while (branches.length > 0) {
        refChunk.push(branches.splice(0, size));
    }
    for (let j = 0; j < refChunk.length; j++) {
        // 1) store branches in a format that makes it easy to concat
        //    commits when receiving the query result
        const refs = buildBranchObject(refChunk[j]);
        // 2) query commits while some branches still have pages. Branches
        //    without remaining pages are dropped from subsequent requests
        //    (buildCommitQuery skips them); the loop exits when no branch
        //    has a next page.
        let hasNextPage = true;
        let count = 0;
        while (hasNextPage) {
            const commitQuery = buildCommitQuery(refs);
            console.log("request : " + count);
            const commitResult = await doRequest(commitQuery, {
                "owner": owner,
                "name": repo
            });
            hasNextPage = false;
            for (const key of Object.keys(refs)) {
                if (commitResult.data.repository[key]) {
                    const history = commitResult.data.repository[key].target.history;
                    refs[key].commits = refs[key].commits.concat(history.nodes);
                    refs[key].cursor = (history.pageInfo.hasNextPage) ? history.pageInfo.endCursor : '';
                    refs[key].hasNextPage = history.pageInfo.hasNextPage;
                    console.log(key + " : " + refs[key].commits.length + "/" + refs[key].totalCount + " : " + refs[key].hasNextPage + " : " + refs[key].cursor + " : " + refs[key].name);
                    if (refs[key].hasNextPage) {
                        hasNextPage = true;
                    }
                }
            }
            count++;
            console.log("------------------------------------");
        }
        for (const key of Object.keys(refs)) {
            console.log(refs[key].totalCount + " : " + refs[key].commits.length + " : " + refs[key].name);
        }
        // 3) write this chunk (up to 19 branches) to a single json file
        fs.writeFile("commits" + j + ".json", JSON.stringify(refs, null, 4), "utf8", function (err) {
            if (err) {
                console.log(err);
            } else {
                console.log("done");
            }
        });
    }
}
// Entry point. Catch rejections explicitly so a failed run (network
// error, bad token, ...) doesn't surface as an unhandled promise
// rejection; doRequest already pretty-prints API errors before rejecting.
requestGraphql().catch(function (err) {
    console.log(err);
});
This also works with repos that have a lot of branches; for instance, this one has more than 700 branches
Rate Limit
Note that while it is true that with GraphQL you can perform a reduced number of requests, it won't necessarily improve your rate limit as the rate limit is based on points & not a limited number of requests : check GraphQL API rate limit