Print multiple urls from large text file

Question

I'm trying to find and print all .com urls from a large text file using regex. As there are approx 40 different urls I'm wondering if there is a way to search for them without doing it one by one.

The code I used gets xxxx.com but is missing the https://www at the beginning. Can anyone tell me how to get the full result? Thank you in advance!

# Scan report.txt line by line and print every substring matching the pattern.
import re 
# len(".com") evaluates to 4; it is passed below as the second (flags)
# argument of re.compile, not as part of the pattern.
url = len(".com") 
# Rebinds the name `re` from the module to the compiled pattern object.
# \w* matches only word characters, so "https://www." can never be part of a
# match; the "." before "com" is unescaped and therefore matches any character.
re = re.compile(r'\w*.com\b', url) 
for line in open("report.txt"): 
    # `url` is reused here as the loop variable holding each match.
    for url in re.findall(line): 
        print url

Take a look at this - http://stackoverflow.com/questions/161738/what-is-the-best-regular-expression-to-check-if-a-string-is-a-valid-url — shaktimaan, Apr 16 '14 at 23:40
`\w` won't match the non-word symbols in `https://www` (or the `.` between the domain and subdomain, if a subdomain is present). Try using `.+\.com\b`. — CAustin, Apr 16 '14 at 23:53

score 0 · Answer 1 · answered Apr 17 '14 at 00:23

This seems to work:

#!/usr/local/cpython-2.7/bin/python

import re

def main(path='logs.txt'):
    """Print every http(s) URL ending in ``.com`` found in a text file.

    The file is scanned one line at a time instead of being read into
    memory in a single call, and each match is printed on its own line.

    Args:
        path: Path of the text file to scan. Defaults to ``'logs.txt'``.

    Raises:
        IOError/OSError: If the file cannot be opened.
    """
    # \S* (rather than [^ \t]*) keeps a match from running across a newline
    # into the next URL; \. requires a literal dot, so "http://telecom" is
    # no longer accepted as ending in ".com".
    regex = re.compile(r'https?://\S*\.com\b')

    with open(path, 'r') as file_:
        for line in file_:
            for url in regex.findall(line):
                print(url)

# Called unconditionally at module level (there is no __main__ guard).
main()

HTH

score 0 · Answer 2 · answered Apr 17 '14 at 07:47

#!/usr/bin/python

import urllib
import urlparse
import re
import requests

#
# A class for dealing with links 
#

class linkGrabber:
  """Extract and classify the href targets of ``<a>`` tags in markup.

  Links are found with a regular expression, not an HTML parser: only tags
  written as ``<a href="...">`` or ``<a href='...'>`` (href as the first
  attribute) are recognised.
  """

  # Captures the quoted href value of an anchor tag. Raw string so "\s" is a
  # regex escape rather than a string escape; the opening quote class
  # accepts only ' or " (no stray "|"), and at least one whitespace
  # character is required between "<a" and "href".
  linkregex = re.compile(r'<a\s+href=[\'"](.*?)[\'"].*?>')

  def clean(self, link):
    """Return ``link`` with every space and ``#`` character removed."""
    return link.replace(' ', '').replace('#', '')

  def depth(self, link):
    """Return the number of ``/`` separators in the path part of ``link``.

    For example ``http://host/a/b`` has depth 2 and ``http://host`` has
    depth 0.
    """
    return urlparse.urlparse(link).path.count("/")

  def isAbsolute(self, link):
    """Return True when ``link`` has a network location (host) component."""
    return bool(urlparse.urlparse(link).netloc)

  def isRelative(self, link):
    """Return True when ``link`` has no network location component."""
    return not urlparse.urlparse(link).netloc

  def grab(self, markup, *args):
    """Return the links found in ``markup``.

    Args:
      markup: Text to search for anchor tags.
      *args: Optional selector. With no extra arguments all links are
        returned, relative ones first and then absolute ones. If ``"abs"``
        is among the arguments only absolute links are returned; any other
        argument selects only relative links.

    Returns:
      A list of href strings, each group in order of appearance.
    """
    relative = []
    absolute = []
    for link in self.linkregex.findall(markup):
      # Classify each link once; a link is relative exactly when it is
      # not absolute.
      if self.isAbsolute(link):
        absolute.append(link)
      else:
        relative.append(link)
    if not args:
      return relative + absolute
    if "abs" in args:
      return absolute
    return relative

Print multiple urls from large text file

2 Answers