Python抓取网页内容 – Neohope的网络笔记

1、BeautifulSoup解析网页

'''
Created on 20150203
@author: Hansen
'''

import urllib2
import sys
import io
from bs4 import BeautifulSoup

#Fetch HTML from URL
def fecth_html(index,url,keepHtml,resultFile):
    req = urllib2.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    content = rsp.read()
    #receive_header = rsp.info()
    #print(sys.getfilesystemencoding())
    #content = content.decode('utf-8','replace')
    
    if keepHtml:
        fileinfo = open(str(index)+'.html','w')
        fileinfo.write(content)
        print("save file "+ str(index)+'.html:    ok')
        
    parse_html(content,resultFile)
    
#Parse HTML
def parse_html(html,resultFile):
    soup = BeautifulSoup(html,fromEncoding="utf8")
    lis = soup.select('div.shop-all-list li')
    print(len(lis))
    for li in lis:
        szTitle = (li.select('div:nth-of-type(2) div:nth-of-type(1) a h4'))[0].get_text()
        szTitle = szTitle.replace("\r\n", "-").replace(" ","");
        szStar = (li.select('div:nth-of-type(2) div:nth-of-type(3) span'))[0]['title']
        szReviewNum = (li.select('div:nth-of-type(2) div:nth-of-type(3) a:nth-of-type(1)'))[0].get_text()
        szReviewNum = szReviewNum.replace("\n", "").replace(" ","");
        szMeanPrice = (li.select('div:nth-of-type(2) div:nth-of-type(3) a:nth-of-type(2)'))[0].get_text()
        szMeanPrice = szMeanPrice.replace("\n", "").replace(" ","");
        szCategory = (li.select('div:nth-of-type(2) div:nth-of-type(4) a:nth-of-type(1)'))[0].get_text()
        szAddressA = (li.select('div:nth-of-type(2) div:nth-of-type(4) a:nth-of-type(2)'))[0].get_text()
        szAddressB = (li.select('div:nth-of-type(2) div:nth-of-type(4) span:nth-of-type(3)'))[0].get_text()
        szAddress = (szAddressA+"-"+szAddressB).replace("\r\n", "-").replace(" ","");
        szTaste = (li.select('div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(1)'))[0].get_text()
        szEvn = (li.select('div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(2)'))[0].get_text()
        szService = (li.select('div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(3)'))[0].get_text()
        
        fileinfo = io.open(resultFile,'a',encoding='utf_16')
        fileinfo.write(szTitle+","+szStar+","+szReviewNum+","+szMeanPrice+","+szCategory+"," +szAddress+","+szTaste+","+szEvn+","+szService+"\n")

2、PyQuery解析网页

'''
Created on 20150203
@author: Hansen
'''

import urllib2
import sys
import io
from pyquery import PyQuery

#Fetch HTML from URL
def fecth_html(index,url,keepHtml,resultFile):
    req = urllib2.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    content = rsp.read()
    #receive_header = rsp.info()
    #print(sys.getfilesystemencoding())
    #content = content.decode('utf-8','replace')
    
    if keepHtml:
        fileinfo = open(str(index)+'.html','w')
        fileinfo.write(content)
        print("save file "+ str(index)+'.html:    ok')
        
    parse_html(content,resultFile)
    
#Parse HTML
def parse_html(html,resultFile):
    doc = PyQuery(html)
    lis = doc('div.shop-all-list li')
    print(len(lis))
    for li in lis:
        li_doc = PyQuery(li)
        szTitle = li_doc('li div div a h4').text()
        szTitle = szTitle.replace("\r\n", "-").replace(" ","");
        szStar = li_doc("li div div span").filter('.sml-rank-stars').attr('title')
        szReviewNum = li_doc('li div div a').filter('.review-num').text()
        szReviewNum = szReviewNum.replace("\n", "").replace(" ","");
        szMeanPrice = li_doc('li div div a').filter('.mean-price').text()
        szMeanPrice = szMeanPrice.replace("\n", "").replace(" ","");
        szCategory = li_doc('li div div a span').filter('.tag').eq(1).text()
        szAddressA = li_doc('li div div a span').filter('.tag').eq(1).text()
        szAddressB = li_doc('li div div span').filter('.addr').eq(0).text()
        szAddress = (szAddressA+"-"+szAddressB).replace("\r\n", "-").replace(" ","");
        szTaste = li_doc('li div span span').eq(0).text()
        szEvn = li_doc('li div span span').eq(1).text()
        szService = li_doc('li div span span').eq(2).text()
        
        fileinfo = io.open(resultFile,'a',encoding='utf_16')
        fileinfo.write(szTitle+","+szStar+","+szReviewNum+","+szMeanPrice+","+szCategory+"," +szAddress+","+szTaste+","+szEvn+","+szService+"\n")

Leave a Reply Cancel reply