Following @kohlehydrat's suggestion:
import urllib2
class TldMatcher(object):
# use class vars for lazy loading
MASTERURL = "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1"
TLDS = None
@classmethod
def loadTlds(cls, url=None):
url = url or cls.MASTERURL
# grab master list
lines = urllib2.urlopen(url).readlines()
# strip comments and blank lines
lines = [ln for ln in (ln.strip() for ln in lines) if len(ln) and ln[:2]!='//']
cls.TLDS = set(lines)
def __init__(self):
if TldMatcher.TLDS is None:
TldMatcher.loadTlds()
def getTld(self, url):
best_match = None
chunks = url.split('.')
for start in range(len(chunks)-1, -1, -1):
test = '.'.join(chunks[start:])
startest = '.'.join(['*']+chunks[start+1:])
if test in TldMatcher.TLDS or startest in TldMatcher.TLDS:
best_match = test
return best_match
def get2ld(self, url):
urls = url.split('.')
tlds = self.getTld(url).split('.')
return urls[-1 - len(tlds)]
def test_TldMatcher():
matcher = TldMatcher()
test_urls = [
'site.co.uk',
'site.com',
'site.me.uk',
'site.jpn.com',
'site.org.uk',
'site.it'
]
errors = 0
for u in test_urls:
res = matcher.get2ld(u)
if res != 'site':
print "Error: found '{0}', should be 'site'".format(res)
errors += 1
if errors==0:
print "Passed!"
return (errors==0)