1

I want to collect incubator information by web scraping with Python, but I get no output after running my code. Here is my code. I need your help!

import requests
from requests.exceptions import RequestException
import re
def get_one_page(url):
    """Fetch *url* and return the response body as text.

    Returns ``None`` when the request raises a ``RequestException``
    (connection error, timeout, ...) or when the server answers with a
    status code other than 200.
    """
    try:
        # A timeout keeps a stalled server from blocking the scraper
        # forever; requests' timeout errors derive from RequestException.
        response = requests.get(url, timeout=10)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text
def parse_one_page(html):
    """Yield one ``{'name': ..., 'address': ...}`` dict per match in *html*.

    *html* is the page source as a string. ``None`` or an empty string
    yields nothing instead of raising, because ``get_one_page`` returns
    ``None`` when the download fails.
    """
    if not html:
        return
    # re.S lets ``.`` match newlines, so a single entry whose markup is
    # spread over several lines can still be matched.
    pattern = re.compile(
        r'f14px c-blue.*?><a.*?>(.*?)</a>.*?fn14px c-666>(.*?)</td>',
        re.S,
    )
    for name, address in pattern.findall(html):
        yield {
            'name': name,
            'address': address,
        }
def main(offset):
    """Download listing page number *offset* and print each parsed entry.

    Prints nothing when the page could not be downloaded.
    """
    # ``%d`` is the integer conversion that puts the page number in the URL.
    url = 'http://www.cnfuhuaqi.com/couveuse/0-0-0-0-0-%d.aspx' % offset
    html = get_one_page(url)
    if html is None:
        # get_one_page signals a failed download with None.
        return
    for item in parse_one_page(html):
        print(item)
if __name__ == '__main__':
    # Walk listing pages 2 through 71 when run as a script.
    for page_number in range(2, 72):
        main(page_number)
Bertrand Martel
  • 42,756
  • 16
  • 135
  • 159
Jack Zhang
  • 23
  • 2

1 Answer

1

Never parse HTML with regex; use an HTML parser such as BeautifulSoup. In your case, you only need to select the element with the zjfw-list-con class and extract the tables inside it. The following will extract the image src URL, the link and the description for two pages (2 and 3):

from bs4 import BeautifulSoup
import requests

# Module-level accumulator: extract_data() appends one tuple per scraped
# entry, and the script prints the collected tuples at the end.
incubators = []

def extract_data(url):
    """Scrape one listing page and append its entries to ``incubators``.

    Each entry is a tuple ``(image src, link href, text of the third
    cell)`` taken from the first row of every table nested inside a table
    of the ``zjfw-list-con`` div. Every tuple is printed and appended to
    the module-level ``incubators`` list.

    Rows that lack the expected cells, image or link are skipped, and a
    page without the container div is reported and ignored, so one
    malformed page does not abort the whole run.
    """
    print("get data from {}".format(url))
    # Without a timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, "html.parser")

    container = soup.find("div", {"class": "zjfw-list-con"})
    if container is None:
        print("no zjfw-list-con element found in {}".format(url))
        return

    for table in container.find_all("table"):
        for subtable in table.find_all('table'):
            row = subtable.find('tr')
            cells = row.find_all('td') if row is not None else []
            if len(cells) < 3:
                continue
            image = cells[0].find('img')
            link = cells[1].find('a')
            src = image.get('src') if image is not None else None
            href = link.get('href') if link is not None else None
            if src is None or href is None:
                continue
            item_tuple = (src, href, cells[2].text.strip())
            print(item_tuple)
            incubators.append(item_tuple)

# URL template for a listing page; %d is filled with the page number.
url = 'http://www.cnfuhuaqi.com/couveuse/0-0-0-0-0-%d.aspx'

# Scrape listing pages 2 and 3.
for page_number in range(2, 4):
    page_url = url % page_number
    extract_data(page_url)

# Dump everything collected, one space-separated entry per line.
print("the full list : ")
for entry in incubators:
    line = ' '.join(entry)
    print(line)
Bertrand Martel
  • 42,756
  • 16
  • 135
  • 159