1、BeautifulSoup解析网页
'''
Created on 20150203
@author: Hansen
'''
import urllib2
import sys
import io
from bs4 import BeautifulSoup
#Fetch HTML from URL
def fecth_html(index,url,keepHtml,resultFile):
    """Download *url* and pass the raw response body to ``parse_html``.

    index      -- value used to name the optional HTML dump, ``<index>.html``.
    url        -- address to request.
    keepHtml   -- when true, the response body is also saved to
                  ``<index>.html`` in the current working directory.
    resultFile -- path handed through unchanged to ``parse_html``.

    Errors raised by ``urllib2`` or by file I/O are not caught here.
    """
    req = urllib2.Request(url)
    # Send a browser-like User-Agent header with the request.
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    try:
        content = rsp.read()
    finally:
        # Release the connection even when reading the body fails.
        rsp.close()
    if keepHtml:
        html_name = str(index) + '.html'
        # ``content`` is the undecoded response body, so it is written in
        # binary mode; the ``with`` block closes the file deterministically.
        with open(html_name, 'wb') as fileinfo:
            fileinfo.write(content)
        print("save file " + html_name + ': ok')
    # NOTE(review): the original indentation was lost; parsing is assumed to
    # run whether or not the HTML dump was requested - confirm.
    parse_html(content, resultFile)
#Parse HTML
def parse_html(html,resultFile):
    """Extract shop records from a listing page and append them to *resultFile*.

    html       -- page markup, parsed with BeautifulSoup using UTF-8 as the
                  declared source encoding.
    resultFile -- output path, opened in append mode with UTF-16 encoding.

    The number of ``li`` elements found under ``div.shop-all-list`` is
    printed.  For each of them one line is written with nine comma-joined
    fields: title, star rating (``title`` attribute), review count, mean
    price, category, address, and the three span texts the original code
    named taste, environment and service.  Field values are not escaped, so
    a comma inside a value is written as-is.

    Raises IndexError when a selector matches nothing inside an ``li`` and
    KeyError when the rating span has no ``title`` attribute.
    """
    # bs4 spells this keyword ``from_encoding``; ``fromEncoding`` is the
    # BeautifulSoup 3 name.
    soup = BeautifulSoup(html, from_encoding="utf8")
    lis = soup.select('div.shop-all-list li')
    count = len(lis)
    print(count)
    if count == 0:
        # Nothing to record; do not create or touch the result file.
        return

    def first(node, selector):
        # First element matching *selector* below *node* (IndexError if none).
        return node.select(selector)[0]

    def compact(text, newline, replacement):
        # Replace *newline* with *replacement*, then drop every space.
        return text.replace(newline, replacement).replace(" ", "")

    # Open the output once for the whole page instead of once per record;
    # the ``with`` block flushes and closes it even if a record fails.
    with io.open(resultFile, 'a', encoding='utf_16') as fileinfo:
        for li in lis:
            title = compact(
                first(li, 'div:nth-of-type(2) div:nth-of-type(1) a h4').get_text(),
                "\r\n", "-")
            star = first(li, 'div:nth-of-type(2) div:nth-of-type(3) span')['title']
            review_num = compact(
                first(li, 'div:nth-of-type(2) div:nth-of-type(3) a:nth-of-type(1)').get_text(),
                "\n", "")
            mean_price = compact(
                first(li, 'div:nth-of-type(2) div:nth-of-type(3) a:nth-of-type(2)').get_text(),
                "\n", "")
            category = first(li, 'div:nth-of-type(2) div:nth-of-type(4) a:nth-of-type(1)').get_text()
            address_a = first(li, 'div:nth-of-type(2) div:nth-of-type(4) a:nth-of-type(2)').get_text()
            address_b = first(li, 'div:nth-of-type(2) div:nth-of-type(4) span:nth-of-type(3)').get_text()
            address = compact(address_a + "-" + address_b, "\r\n", "-")
            taste = first(li, 'div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(1)').get_text()
            evn = first(li, 'div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(2)').get_text()
            service = first(li, 'div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(3)').get_text()
            fields = [title, star, review_num, mean_price, category,
                      address, taste, evn, service]
            fileinfo.write(",".join(fields) + "\n")
2、PyQuery解析网页
'''
Created on 20150203
@author: Hansen
'''
import urllib2
import sys
import io
from pyquery import PyQuery
#Fetch HTML from URL
def fecth_html(index,url,keepHtml,resultFile):
    """Download *url* and pass the raw response body to ``parse_html``.

    index      -- value used to name the optional HTML dump, ``<index>.html``.
    url        -- address to request.
    keepHtml   -- when true, the response body is also saved to
                  ``<index>.html`` in the current working directory.
    resultFile -- path handed through unchanged to ``parse_html``.

    Errors raised by ``urllib2`` or by file I/O are not caught here.
    """
    req = urllib2.Request(url)
    # Send a browser-like User-Agent header with the request.
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    try:
        content = rsp.read()
    finally:
        # Release the connection even when reading the body fails.
        rsp.close()
    if keepHtml:
        html_name = str(index) + '.html'
        # ``content`` is the undecoded response body, so it is written in
        # binary mode; the ``with`` block closes the file deterministically.
        with open(html_name, 'wb') as fileinfo:
            fileinfo.write(content)
        print("save file " + html_name + ': ok')
    # NOTE(review): the original indentation was lost; parsing is assumed to
    # run whether or not the HTML dump was requested - confirm.
    parse_html(content, resultFile)
#Parse HTML
def parse_html(html,resultFile):
    """Extract shop records from a listing page and append them to *resultFile*.

    html       -- page markup, parsed with PyQuery.
    resultFile -- output path, opened in append mode with UTF-16 encoding.

    The number of ``li`` elements found under ``div.shop-all-list`` is
    printed.  For each of them one line is written with nine comma-joined
    fields: title, star rating (``title`` attribute of the
    ``.sml-rank-stars`` span), review count, mean price, category, address,
    and the three span texts the original code named taste, environment and
    service.  Field values are not escaped, so a comma inside a value is
    written as-is.
    """
    doc = PyQuery(html)
    lis = doc('div.shop-all-list li')
    count = len(lis)
    print(count)
    if count == 0:
        # Nothing to record; do not create or touch the result file.
        return

    def compact(text, newline, replacement):
        # Replace *newline* with *replacement*, then drop every space.
        return text.replace(newline, replacement).replace(" ", "")

    # Open the output once for the whole page instead of once per record;
    # the ``with`` block flushes and closes it even if a record fails.
    with io.open(resultFile, 'a', encoding='utf_16') as fileinfo:
        for li in lis:
            li_doc = PyQuery(li)
            title = compact(li_doc('li div div a h4').text(), "\r\n", "-")
            # attr() yields None when the attribute is absent; fall back to
            # an empty field so the join below does not fail.
            star = li_doc("li div div span").filter('.sml-rank-stars').attr('title') or ""
            review_num = compact(
                li_doc('li div div a').filter('.review-num').text(), "\n", "")
            mean_price = compact(
                li_doc('li div div a').filter('.mean-price').text(), "\n", "")
            tags = li_doc('li div div a span').filter('.tag')
            # The category is the first ``.tag`` span and the address part the
            # second; the original read index 1 for both, which duplicated the
            # address into the category column.
            category = tags.eq(0).text()
            address_a = tags.eq(1).text()
            address_b = li_doc('li div div span').filter('.addr').eq(0).text()
            address = compact(address_a + "-" + address_b, "\r\n", "-")
            scores = li_doc('li div span span')
            taste = scores.eq(0).text()
            evn = scores.eq(1).text()
            service = scores.eq(2).text()
            fields = [title, star, review_num, mean_price, category,
                      address, taste, evn, service]
            fileinfo.write(",".join(fields) + "\n")