1、BeautifulSoup解析网页
'''
Created on 20150203

@author: Hansen

Fetch a page over HTTP and use BeautifulSoup to append one comma-separated
row per ``div.shop-all-list li`` entry to a result file.
'''
import io
import sys

try:
    import urllib2
except ImportError:  # urllib2 became urllib.request in Python 3
    import urllib.request as urllib2

from bs4 import BeautifulSoup

_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0'

# Every field lives under the second <div> of a listing <li>.
_INFO_DIV = 'div:nth-of-type(2) '


#Fetch HTML from URL
def fecth_html(index, url, keepHtml, resultFile):
    '''Download *url* and hand the response body to parse_html.

    The misspelled name is kept so existing callers keep working.

    index      -- used to name the optional saved copy, ``<index>.html``.
    url        -- address to request (sent with a browser User-Agent header).
    keepHtml   -- when true, the raw response body is saved to
                  ``<index>.html`` in the current directory.
    resultFile -- path forwarded to parse_html.
    '''
    req = urllib2.Request(url)
    req.add_header('User-Agent', _USER_AGENT)
    rsp = urllib2.urlopen(req)
    try:
        content = rsp.read()
    finally:
        # Release the connection even if reading fails.
        rsp.close()

    if keepHtml:
        html_name = str(index) + '.html'
        # The body is raw bytes: write it in binary mode so it is stored
        # untranslated, and close the handle deterministically.
        with open(html_name, 'wb') as html_file:
            html_file.write(content)
        print("save file " + html_name + ": ok")

    parse_html(content, resultFile)


def _first(node, selector):
    '''Return the first element under *node* matching *selector*.

    Raises IndexError when nothing matches.
    '''
    return node.select(selector)[0]


#Parse HTML
def parse_html(html, resultFile):
    '''Extract one row per listing entry from *html* and append it to *resultFile*.

    html       -- page markup, decoded by BeautifulSoup as UTF-8.
    resultFile -- path of the UTF-16 text file rows are appended to.

    Prints the number of ``div.shop-all-list li`` entries found. Each row is
    title, star, review count, mean price, category, address, taste,
    environment, service, joined by commas. Raises IndexError/KeyError if an
    entry lacks one of the expected elements.
    '''
    soup = BeautifulSoup(html, from_encoding="utf8")
    lis = soup.select('div.shop-all-list li')
    print(len(lis))
    if not lis:
        # Nothing to write; do not create an empty result file.
        return

    # Open the result file once for the whole page instead of once per row,
    # and make sure it is closed (and flushed) when we are done.
    with io.open(resultFile, 'a', encoding='utf_16') as result:
        for li in lis:
            szTitle = _first(li, _INFO_DIV + 'div:nth-of-type(1) a h4').get_text()
            szTitle = szTitle.replace("\r\n", "-").replace(" ", "")

            szStar = _first(li, _INFO_DIV + 'div:nth-of-type(3) span')['title']

            szReviewNum = _first(li, _INFO_DIV + 'div:nth-of-type(3) a:nth-of-type(1)').get_text()
            szReviewNum = szReviewNum.replace("\n", "").replace(" ", "")

            szMeanPrice = _first(li, _INFO_DIV + 'div:nth-of-type(3) a:nth-of-type(2)').get_text()
            szMeanPrice = szMeanPrice.replace("\n", "").replace(" ", "")

            szCategory = _first(li, _INFO_DIV + 'div:nth-of-type(4) a:nth-of-type(1)').get_text()

            szAddressA = _first(li, _INFO_DIV + 'div:nth-of-type(4) a:nth-of-type(2)').get_text()
            szAddressB = _first(li, _INFO_DIV + 'div:nth-of-type(4) span:nth-of-type(3)').get_text()
            szAddress = (szAddressA + "-" + szAddressB).replace("\r\n", "-").replace(" ", "")

            szTaste = _first(li, _INFO_DIV + 'span:nth-of-type(5) span:nth-of-type(1)').get_text()
            szEvn = _first(li, _INFO_DIV + 'span:nth-of-type(5) span:nth-of-type(2)').get_text()
            szService = _first(li, _INFO_DIV + 'span:nth-of-type(5) span:nth-of-type(3)').get_text()

            # NOTE: fields are joined verbatim; a comma inside a field is not escaped.
            row = [szTitle, szStar, szReviewNum, szMeanPrice, szCategory,
                   szAddress, szTaste, szEvn, szService]
            result.write(",".join(row) + "\n")
2、PyQuery解析网页
'''
Created on 20150203

@author: Hansen

Fetch a page over HTTP and use PyQuery to append one comma-separated row per
``div.shop-all-list li`` entry to a result file.
'''
import io
import sys

try:
    import urllib2
except ImportError:  # urllib2 became urllib.request in Python 3
    import urllib.request as urllib2

from pyquery import PyQuery

_USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0'


#Fetch HTML from URL
def fecth_html(index, url, keepHtml, resultFile):
    '''Download *url* and hand the response body to parse_html.

    The misspelled name is kept so existing callers keep working.

    index      -- used to name the optional saved copy, ``<index>.html``.
    url        -- address to request (sent with a browser User-Agent header).
    keepHtml   -- when true, the raw response body is saved to
                  ``<index>.html`` in the current directory.
    resultFile -- path forwarded to parse_html.
    '''
    req = urllib2.Request(url)
    req.add_header('User-Agent', _USER_AGENT)
    rsp = urllib2.urlopen(req)
    try:
        content = rsp.read()
    finally:
        # Release the connection even if reading fails.
        rsp.close()

    if keepHtml:
        html_name = str(index) + '.html'
        # The body is raw bytes: write it in binary mode so it is stored
        # untranslated, and close the handle deterministically.
        with open(html_name, 'wb') as html_file:
            html_file.write(content)
        print("save file " + html_name + ": ok")

    parse_html(content, resultFile)


#Parse HTML
def parse_html(html, resultFile):
    '''Extract one row per listing entry from *html* and append it to *resultFile*.

    html       -- page markup handed to PyQuery.
    resultFile -- path of the UTF-16 text file rows are appended to.

    Prints the number of ``div.shop-all-list li`` entries found. Each row is
    title, star, review count, mean price, category, address, taste,
    environment, service, joined by commas.
    '''
    doc = PyQuery(html)
    lis = doc('div.shop-all-list li')
    print(len(lis))
    if not lis:
        # Nothing to write; do not create an empty result file.
        return

    # Open the result file once for the whole page instead of once per row,
    # and make sure it is closed (and flushed) when we are done.
    with io.open(resultFile, 'a', encoding='utf_16') as result:
        for li in lis:
            li_doc = PyQuery(li)

            szTitle = li_doc('li div div a h4').text()
            szTitle = szTitle.replace("\r\n", "-").replace(" ", "")

            szStar = li_doc("li div div span").filter('.sml-rank-stars').attr('title')

            szReviewNum = li_doc('li div div a').filter('.review-num').text()
            szReviewNum = szReviewNum.replace("\n", "").replace(" ", "")

            szMeanPrice = li_doc('li div div a').filter('.mean-price').text()
            szMeanPrice = szMeanPrice.replace("\n", "").replace(" ", "")

            # The first .tag span is the category and the second is the area.
            # The original read index 1 for both, so the category column
            # merely duplicated the first half of the address.
            tags = li_doc('li div div a span').filter('.tag')
            szCategory = tags.eq(0).text()
            szAddressA = tags.eq(1).text()
            szAddressB = li_doc('li div div span').filter('.addr').eq(0).text()
            szAddress = (szAddressA + "-" + szAddressB).replace("\r\n", "-").replace(" ", "")

            scores = li_doc('li div span span')
            szTaste = scores.eq(0).text()
            szEvn = scores.eq(1).text()
            szService = scores.eq(2).text()

            # NOTE: fields are joined verbatim; a comma inside a field is not escaped.
            row = [szTitle, szStar, szReviewNum, szMeanPrice, szCategory,
                   szAddress, szTaste, szEvn, szService]
            result.write(",".join(row) + "\n")