Here's a Python script that lists and clones all forks that are ahead.

It doesn't use the API, so it isn't subject to the rate limit and doesn't require authentication; the flip side is that it may need adjustments whenever the GitHub website design changes (a rough sketch of the API-based alternative is at the end of this answer). Unlike the bookmarklet in the other answer, which only shows links to ZIP files, this script also preserves information about the commits, because it uses git clone, and it additionally creates a commits.htm file with an overview.
import requests, re, os, sys, time

ENCODING = "utf-8"

def content_from_url(url):
    # TODO: handle network errors (timeouts, the internet being off, etc.)
    return requests.get(url).content

def clone_ahead_forks(forklist_url):
    # download the page that lists the forks and keep a copy for reference
    forklist_htm = content_from_url(forklist_url).decode(ENCODING)
    with open("forklist.htm", "w", encoding=ENCODING) as text_file:
        text_file.write(forklist_htm)
    is_root = True
    # not working if there are no forks: '<a class="(Link--secondary)?" href="(/([^/"]*)/[^/"]*)">'
    for match in re.finditer('<a (class=""|data-pjax="#js-repo-pjax-container") href="(/([^/"]*)/[^/"]*)">', forklist_htm):
        fork_url = 'https://github.com' + match.group(2)
        fork_owner_login = match.group(3)
        fork_htm = content_from_url(fork_url).decode(ENCODING)
        match2 = re.search('([0-9]+ commits? ahead(, [0-9]+ commits? behind)?)', fork_htm)
        # TODO: check whether 'ahead'/'behind'/'even with' appear only once on the entire page -
        # in that case they are not part of the readme, "About" box, etc.
        sys.stdout.write('.')
        if match2 or is_root:
            if match2:
                aheadness = match2.group(1)  # for example '1 commit ahead, 2 commits behind'
            else:
                aheadness = 'root repo'
                is_root = False  # for subsequent iterations
            dir = fork_owner_login + ' (' + aheadness + ')'
            print(dir)
            if not os.path.exists(dir):
                os.mkdir(dir)
                os.chdir(dir)
                # save commits.htm
                commits_htm = content_from_url(fork_url + '/commits').decode(ENCODING)
                with open("commits.htm", "w", encoding=ENCODING) as text_file:
                    text_file.write(commits_htm)
                # git clone
                os.system('git clone ' + fork_url + '.git')
                print()
                # no need to recurse into forks of forks because they are all listed
                # on the initial page and being traversed already
                os.chdir('..')
            else:
                print(dir + ' already exists, skipping.')

base_path = os.getcwd()
match_disk_letter = re.search(r'^([a-zA-Z]:\\)', base_path)
with open('repo_urls.txt') as url_file:
    for url in url_file:
        url = url.strip()
        url = re.sub(r'\?[^/]*$', '', url)  # remove strings like '?utm_source=...' from the end
        print(url)
        match = re.search('github.com/([^/]*)/([^/]*)$', url)
        if match:
            user_name = match.group(1)
            repo_name = match.group(2)
            print(repo_name)
            dirname_for_forks = repo_name + ' (' + user_name + ')'
            if not os.path.exists(dirname_for_forks):
                url += "/network/members"  # page that lists the forks
                TMP_DIR = 'tmp_' + time.strftime("%Y%m%d-%H%M%S")
                if match_disk_letter:  # on Windows (path starts with 'C:\' or similar), clone into C:\tmp_... instead of .\tmp_... to prevent "filename too long" errors
                    TMP_DIR = match_disk_letter.group(1) + TMP_DIR
                print(TMP_DIR)
                os.mkdir(TMP_DIR)
                os.chdir(TMP_DIR)
                clone_ahead_forks(url)
                print()
                os.chdir(base_path)
                os.rename(TMP_DIR, dirname_for_forks)
            else:
                print(dirname_for_forks + ' ALREADY EXISTS, SKIPPING.')
print('DONE.')
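To run it, save the script in an empty working directory (as, say, clone_ahead_forks.py; the file name is an arbitrary choice), make sure the requests package is installed (pip install requests) and that git is available on the PATH, then start it with:

python clone_ahead_forks.py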
If you make the file repo_urls.txt with the following content (you can put several URLs, one URL per line):
https://github.com/cifkao/tonnetz-viz
then you'll get the following directory tree, where each fork directory contains the respective cloned repo:

tonnetz-viz (cifkao)
    bakaiadam (2 commits ahead)
    chumo (2 commits ahead, 4 commits behind)
    cifkao (root repo)
    codedot (76 commits ahead, 27 commits behind)
    k-hatano (41 commits ahead)
    shimafuri (11 commits ahead, 8 commits behind)
If it doesn't work (for example, because GitHub changed its page structure again), try earlier versions of this script from the answer's edit history.
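For comparison, here is roughly what the API-based route mentioned at the top could look like. This is a minimal sketch, not a drop-in replacement: forks_ahead_via_api is a name I made up, it only fetches the first page of forks (the endpoint is paginated), it is subject to the unauthenticated rate limit, and it assumes each fork kept the parent's default branch name.

import requests

def forks_ahead_via_api(owner, repo):
    # first page of direct forks only; real code would follow the pagination links
    forks = requests.get(f'https://api.github.com/repos/{owner}/{repo}/forks').json()
    for fork in forks:
        # cross-repo compare: base is a branch of the parent, head is 'user:branch';
        # assumption: the fork's default branch has the same name as the parent's
        head = fork['owner']['login'] + ':' + fork['default_branch']
        cmp = requests.get(f"https://api.github.com/repos/{owner}/{repo}"
                           f"/compare/{fork['default_branch']}...{head}").json()
        if cmp.get('ahead_by', 0) > 0:
            print(f"{fork['full_name']}: {cmp['ahead_by']} ahead, {cmp['behind_by']} behind")

forks_ahead_via_api('cifkao', 'tonnetz-viz')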