0

I'm trying to find and print all .com urls from a large text file using regex. As there are approx 40 different urls I'm wondering if there is a way to search for them without doing it one by one.

The code I used gets xxxx.com but is missing the https://www at the beginning. Can anyone tell me how I get the full result? Thank you in advance!

# Question's attempt: scan report.txt line by line and print each regex match.
import re 
# len(".com") evaluates to 4. It is passed below as the second argument of
# re.compile, which is the `flags` parameter, not a length.
# NOTE(review): presumably unintended -- confirm with the author.
url = len(".com") 
# Rebinding the name `re` to the compiled pattern shadows the `re` module for
# the rest of the script.
# `\w*` matches only word characters, so it cannot span the `://` or the `.`
# separators of a full URL; the unescaped `.` before `com` matches any
# character, not only a literal dot.
re = re.compile(r'\w*.com\b', url) 
# The file object returned by open() is never explicitly closed.
for line in open("report.txt"): 
    # `url` is reused here as the loop variable, replacing the integer above.
    for url in re.findall(line): 
        # Python 2 print statement.
        print url
Andrew Barber
  • 39,603
  • 20
  • 94
  • 123
  • Take a look at this - http://stackoverflow.com/questions/161738/what-is-the-best-regular-expression-to-check-if-a-string-is-a-valid-url – shaktimaan Apr 16 '14 at 23:40
  • `\w` won't match the non-word symbols in `https://www` (or the `.` between the domain and subdomain, if a subdomain is present). Try using `.+\.com\b`. – CAustin Apr 16 '14 at 23:53

2 Answers

0

This seems to work:

#!/usr/local/cpython-2.7/bin/python

import re

def main(path='logs.txt'):
    """Print every http(s) URL ending in ``.com`` found in a text file.

    Each match is printed on its own line, in the order it appears in the
    file. A match runs from the scheme up to the first ``.com`` that is
    followed by a word boundary, so any path after the host is not included.

    Args:
        path: File to scan. Defaults to ``'logs.txt'``.

    Raises:
        IOError/OSError: If the file cannot be opened.
    """
    # Pattern notes:
    #   \S*?   - non-greedy run of non-whitespace, so a match can never cross
    #            a space, tab or newline and stops at the first ".com"
    #            (adjacent URLs are not merged into one match).
    #   \.com  - the dot is escaped so only a literal "." is accepted.
    #   \b     - rejects hosts such as ".company" where "com" is only a prefix.
    regex = re.compile(r'https?://\S*?\.com\b')

    # Matches cannot span lines, so the file is streamed line by line instead
    # of being read into memory at once.
    with open(path, 'r') as file_:
        for line in file_:
            for url in regex.findall(line):
                print(url)


# Only run when executed as a script, not when imported.
if __name__ == '__main__':
    main()

HTH

dstromberg
  • 6,954
  • 1
  • 26
  • 27
0
#!/usr/bin/python

import urllib
import urlparse
import re
import requests

#
# A class for dealing with links 
#

class linkGrabber:
  """Extract the href targets of ``<a>`` tags and classify them.

  A link is treated as absolute when ``urlparse`` finds a network location
  (host) in it, and as relative otherwise.
  """

  # Captures the quoted href value of an anchor tag whose first attribute is
  # href. Only ' and " are accepted as the opening quote, and at least one
  # whitespace character is required between "<a" and "href".
  linkregex = re.compile(r'<a\s+href=[\'"](.*?)[\'"].*?>')

  def clean(self, link):
    """Return *link* with every space and ``#`` character removed."""
    return link.replace(' ', '').replace('#', '')

  def depth(self, link):
    """Return the number of ``/`` separators in the path part of *link*."""
    return urlparse.urlparse(link).path.count("/")

  def isAbsolute(self, link):
    """Return True if *link* contains a network location (host)."""
    return bool(urlparse.urlparse(link).netloc)

  def isRelative(self, link):
    """Return True if *link* has no network location (host)."""
    return not self.isAbsolute(link)

  def grab(self, markup, *args):
    """Return the links found in *markup*.

    Args:
      markup: HTML text to scan with ``linkregex``.
      *args: Optional selector. With no arguments, all links are returned
        (relative links first, then absolute ones). If ``"abs"`` is among
        the arguments only absolute links are returned; any other argument
        selects only relative links.

    Returns:
      A list of href strings in document order within each group.
    """
    relative = []
    absolute = []
    for link in self.linkregex.findall(markup):
      # Classify each link once; it is either absolute or relative.
      if self.isAbsolute(link):
        absolute.append(link)
      else:
        relative.append(link)
    if not args:
      return relative + absolute
    if "abs" in args:
      return absolute
    return relative
Ricky Wilson
  • 3,187
  • 4
  • 24
  • 29