使用NLTK进行分词及词性标注

1、首先是安装
1.1、安装Python 3.4
注意要用32位版本
http://www.python.org/downloads/

1.2、安装Numpy
注意两点,一是不一定所有版本都有windows安装包,二是要找支持python3.4的安装包
http://sourceforge.net/projects/numpy/files/NumPy/

1.3、安装NLTK
注意3.2版本有bug,不要用。
http://pypi.python.org/pypi/nltk

2、下载NLTK Data
方法1:
在python中运行:

import nltk
nltk.download()

方法2:
到下面的地址,直接去找链接,然后自己下载解压
https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml

3、进行分词
3.1、设置环境变量

set PYTHON_HOME=C:\NeoLanguages\Python34_x86
set PATH=%PYTHON_HOME%;%PATH%
set NLTK_DATA=D:\NLP\NLTK\nltk_data
@python

3.2、py文件

#!/usr/bin/python

import nltk

# Test sentence (kept verbatim, including its typographic apostrophes)
sentence = "Don’t ever let somebody tell you you can’t do something, not even me. \
You got a dream, you gotta protect it. People can’t do something themselves, \
they wanna tell you you can’t do it. If you want something, go get it. Period."

# Tokenization: split the sentence into word and punctuation tokens
tokens = nltk.word_tokenize(sentence)

# Part-of-speech tagging: pair every token with its tag
tagged = nltk.pos_tag(tokens)

# Named-entity chunking: wraps recognised entities in labelled subtrees
# (this is entity recognition on the tagged tokens, not a syntactic parse)
entities = nltk.chunk.ne_chunk(tagged)

3.3、逐句运行

D:\MyProjects\NLP\NLTK>python
Python 3.4.4 (v3.4.4:737efcadf5a6, Dec 20 2015, 19:28:18) [MSC v.1600 32 bit (In
tel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> import nltk
>>> sentence = "Don’t ever let somebody tell you you can’t do something, not e
ven me. \
... You got a dream, you gotta protect it. People can’t do something themselves
, \
... they wanna tell you you can’t do it. If you want something, go get it. Peri
od."
>>> tokens = nltk.word_tokenize(sentence)
>>> tagged = nltk.pos_tag(tokens)
>>> entities = nltk.chunk.ne_chunk(tagged)

>>> tokens
['Don’t', 'ever', 'let', 'somebody', 'tell', 'you', 'you', 'can’t', 'do', 'som
ething', ',', 'not', 'even', 'me', '.', 'You', 'got', 'a', 'dream', ',', 'you',
'got', 'ta', 'protect', 'it', '.', 'People', 'can’t', 'do', 'something', 'thems
elves', ',', 'they', 'wan', 'na', 'tell', 'you', 'you', 'can’t', 'do', 'it', '.
', 'If', 'you', 'want', 'something', ',', 'go', 'get', 'it', '.', 'Period', '.']

>>> tagged
[('Don’t', 'NNP'), ('ever', 'RB'), ('let', 'VB'), ('somebody', 'NN'), ('tell',
'VB'), ('you', 'PRP'), ('you', 'PRP'), ('can’t', 'VBP'), ('do', 'VB'), ('someth
ing', 'NN'), (',', ','), ('not', 'RB'), ('even', 'RB'), ('me', 'PRP'), ('.', '.'
), ('You', 'PRP'), ('got', 'VBD'), ('a', 'DT'), ('dream', 'NN'), (',', ','), ('y
ou', 'PRP'), ('got', 'VBD'), ('ta', 'JJ'), ('protect', 'NN'), ('it', 'PRP'), ('.
', '.'), ('People', 'NNS'), ('can’t', 'VBP'), ('do', 'VBP'), ('something', 'NN'
), ('themselves', 'PRP'), (',', ','), ('they', 'PRP'), ('wan', 'VBP'), ('na', 'T
O'), ('tell', 'VB'), ('you', 'PRP'), ('you', 'PRP'), ('can’t', 'VBP'), ('do', '
VB'), ('it', 'PRP'), ('.', '.'), ('If', 'IN'), ('you', 'PRP'), ('want', 'VBP'),
('something', 'NN'), (',', ','), ('go', 'VBP'), ('get', 'VB'), ('it', 'PRP'), ('
.', '.'), ('Period', 'NNP'), ('.', '.')]

>>> entities
Tree('S', [('Don’t', 'NNP'), ('ever', 'RB'), ('let', 'VB'), ('somebody', 'NN'),
 ('tell', 'VB'), ('you', 'PRP'), ('you', 'PRP'), ('can’t', 'VBP'), ('do', 'VB')
, ('something', 'NN'), (',', ','), ('not', 'RB'), ('even', 'RB'), ('me', 'PRP'),
 ('.', '.'), ('You', 'PRP'), ('got', 'VBD'), ('a', 'DT'), ('dream', 'NN'), (',',
 ','), ('you', 'PRP'), ('got', 'VBD'), ('ta', 'JJ'), ('protect', 'NN'), ('it', '
PRP'), ('.', '.'), ('People', 'NNS'), ('can’t', 'VBP'), ('do', 'VBP'), ('someth
ing', 'NN'), ('themselves', 'PRP'), (',', ','), ('they', 'PRP'), ('wan', 'VBP'),
 ('na', 'TO'), ('tell', 'VB'), ('you', 'PRP'), ('you', 'PRP'), ('can’t', 'VBP')
, ('do', 'VB'), ('it', 'PRP'), ('.', '.'), ('If', 'IN'), ('you', 'PRP'), ('want'
, 'VBP'), ('something', 'NN'), (',', ','), ('go', 'VBP'), ('get', 'VB'), ('it',
'PRP'), ('.', '.'), Tree('PERSON', [('Period', 'NNP')]), ('.', '.')])
>>>

Python抓取JSON网页内容

# -*- coding: UTF-8 -*-
'''
Created on 20150206

@author: Hansen
'''

import urllib2
import sys
import io
import json

#Fetch HTML from URL
def fecth_html(index,url,keepHtml,resultFile):
    '''Download ``url`` and pass the response body to ``parse_json``.

    index      -- used to name the optional saved copy, ``<index>.html``
    url        -- address to request
    keepHtml   -- when true, also save the raw response body to ``<index>.html``
    resultFile -- forwarded unchanged to ``parse_json``
    '''
    req = urllib2.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    try:
        # The body is kept exactly as received; no decoding happens here.
        content = rsp.read()
    finally:
        # Release the connection even when reading fails.
        rsp.close()

    if keepHtml:
        # Binary mode stores the bytes untouched (no newline translation),
        # and the with-block guarantees the file is flushed and closed.
        with open(str(index)+'.html','wb') as fileinfo:
            fileinfo.write(content)
        print("save file "+ str(index)+'.html:    ok')

    parse_json(content,resultFile)
    
#Parse JSON
def parse_json(content,resultFile):
    '''Append one comma-separated line per shop found in ``content``.

    content    -- JSON text whose top-level object has a ``shopBeans`` list
    resultFile -- path of the UTF-16 text file that rows are appended to

    Each row holds: name, star title, average price, "region-address",
    and the three refined scores. The number of shops is printed first.
    Raises KeyError when an expected key is missing from the JSON.
    '''
    jsonData = json.loads(content)
    shops = jsonData['shopBeans']
    print(len(shops))
    if not shops:
        # Nothing to append, so leave resultFile untouched.
        return

    # Open once for the whole batch. The UTF-16 codec emits a byte-order mark
    # whenever a writer starts on an empty file, so one unclosed writer per row
    # would scatter extra marks through the output while earlier rows are
    # still buffered. The with-block also guarantees the handle is closed.
    with io.open(resultFile,'a',encoding='utf_16') as fileinfo:
        for shop in shops:
            szTitle = shop['filterFullName']
            szTitle = szTitle.replace("\r\n", "-").replace(" ","")
            szStar = shop['shopPowerTitle']
            szMeanPrice = str(shop['avgPrice'])
            szMeanPrice = szMeanPrice.replace("\n", "").replace(" ","")
            szAddressA = shop['mainRegionName']
            szAddressB = shop['address']
            szAddress = (szAddressA+"-"+szAddressB).replace("\r\n", "-").replace(" ","")
            szTaste = shop['refinedScore1']
            szEvn = shop['refinedScore2']
            szService = shop['refinedScore3']

            fileinfo.write(szTitle+","+szStar+","+szMeanPrice+","+szAddress+"," +szTaste+","+szEvn+","+szService+"\n")

CSharp抓取JSON网页内容

    using Newtonsoft.Json;
    using DaZhongDianPing.JsonBeans;

    /// <summary>
    /// Downloads a JSON shop ranking and appends its entries to a comma-separated text file.
    /// </summary>
    class JsonCrawler
    {
        /// <summary>
        /// Downloads <paramref name="uri"/>, deserializes the body into a TopShopsBean and
        /// appends one line per shop to <paramref name="szResultPath"/>.
        /// </summary>
        /// <param name="uri">Address of the JSON document.</param>
        /// <param name="szResultPath">Text file that receives one line per shop (opened in append mode).</param>
        /// <param name="szErrorPath">Text file that receives exception messages (opened in append mode).</param>
        /// <returns>
        /// Counters for this document: total shops, shops written, shops that failed.
        /// bSuccess is true when the download and the walk over the shop list completed,
        /// false when the download raised a WebException or the body held no shop list.
        /// </returns>
        private PhaseResultBean PhaseJson(Uri uri, String szResultPath, String szErrorPath)
        {
            PhaseResultBean result = new PhaseResultBean();

            try
            {
                //Fetch the page; WebClient is disposable, so release it once the body is in memory
                string pageHtml;
                using (WebClient client = new WebClient())
                {
                    client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
                    Byte[] pageData = client.DownloadData(uri);
                    pageHtml = Encoding.UTF8.GetString(pageData);
                }
                JsonBeans.TopShopsBean topShops = JsonConvert.DeserializeObject<JsonBeans.TopShopsBean>(pageHtml);

                //A body without a shop list cannot be walked; report it instead of dereferencing null
                if (topShops == null || topShops.ShopBeans == null)
                {
                    using (StreamWriter sw = new StreamWriter(szErrorPath, true))
                    {
                        sw.WriteLine("No shopBeans list in response: " + uri);
                    }
                    result.bSuccess = false;
                    return result;
                }

                //Walk the JSON
                int len = topShops.ShopBeans.Length;
                result.total = len;

                foreach (ShopBean shop in topShops.ShopBeans)
                {
                    try
                    {
                        String szTitle = shop.FilterFullName;
                        if (szTitle != null) szTitle = szTitle.Replace("\r\n", "-");
                        String szStar = shop.ShopPowerTitle;
                        String szMeanPrice = shop.AvgPrice.ToString();
                        String szRegionName = shop.MainRegionName;
                        String szAddress = shop.Address;
                        //Strings are immutable: Replace returns the new value, so it must be assigned back.
                        //Commas are swapped out because the comma is the column separator of the output.
                        if (szAddress != null) szAddress = szAddress.Replace(",", "-");

                        String szTaste = shop.RefinedScore1;
                        String szEvn = shop.RefinedScore2;
                        String szService = shop.RefinedScore3;

                        //Append the extracted fields to the result file
                        using (StreamWriter sw = new StreamWriter(szResultPath, true))
                        {
                            sw.WriteLine(szTitle + "," + szStar + "," + szMeanPrice + "," + szRegionName + "," + szAddress + "," + szTaste + "," + szEvn + "," + szService);
                        }

                        result.successed += 1;
                    }
                    catch (Exception Ex)
                    {
                        //One bad shop must not stop the rest: log it and keep going
                        using (StreamWriter sw = new StreamWriter(szErrorPath, true))
                        {
                            sw.WriteLine(Ex.Message);
                        }
                        result.failed += 1;
                    }
                }

                //The field defaults to false, so success has to be recorded explicitly
                result.bSuccess = true;
            }
            catch (WebException webEx)
            {
                using (StreamWriter sw = new StreamWriter(szErrorPath, true))
                {
                    sw.WriteLine(webEx.Message);
                }
                result.bSuccess = false;
            }

            return result;
        }
    }

    /// <summary>
    /// Plain result holder returned by the crawler for one processed page.
    /// </summary>
    class PhaseResultBean
    {
        // Overall outcome flag for the page; the crawler clears it when the download fails.
        public bool bSuccess;

        // Per-page counters: entries found, entries written, entries that raised an error.
        public int total, successed, failed;
    }

    /// <summary>
    /// Selects the JSON library used for parsing; Newtonsoft is the only option.
    /// </summary>
    public enum JsonEnginType
    {
        JsonEngin_Newtonsoft = 0
    }

    /// <summary>
    /// One shop entry of the downloaded JSON. Every property is bound to the JSON key
    /// named in its JsonProperty attribute; properties typed <c>object</c> accept
    /// whatever value the key carries.
    /// </summary>
    internal class ShopBean
    {
        [JsonProperty("addDate")] public string AddDate { get; set; }
        [JsonProperty("addUser")] public object AddUser { get; set; }
        [JsonProperty("addUserName")] public object AddUserName { get; set; }
        [JsonProperty("address")] public string Address { get; set; }
        [JsonProperty("altName")] public string AltName { get; set; }
        [JsonProperty("avgPrice")] public int AvgPrice { get; set; }
        [JsonProperty("branchName")] public string BranchName { get; set; }
        [JsonProperty("branchTotal")] public int BranchTotal { get; set; }
        [JsonProperty("businessHours")] public string BusinessHours { get; set; }
        [JsonProperty("canSendSms")] public object CanSendSms { get; set; }
        [JsonProperty("categoryId")] public int CategoryId { get; set; }
        [JsonProperty("cityId")] public int CityId { get; set; }
        [JsonProperty("crossRoad")] public string CrossRoad { get; set; }
        [JsonProperty("defaultPic")] public string DefaultPic { get; set; }
        [JsonProperty("defaultPicBig")] public object DefaultPicBig { get; set; }
        [JsonProperty("dishTagList")] public string[][] DishTagList { get; set; }
        [JsonProperty("dishTags")] public string DishTags { get; set; }
        [JsonProperty("district")] public int District { get; set; }
        [JsonProperty("districtName")] public object DistrictName { get; set; }
        [JsonProperty("filterFullAdress")] public string FilterFullAdress { get; set; }
        [JsonProperty("filterFullName")] public string FilterFullName { get; set; }
        [JsonProperty("firstReviewId")] public int FirstReviewId { get; set; }
        [JsonProperty("firstUserFace")] public object FirstUserFace { get; set; }
        [JsonProperty("firstUserNickName")] public object FirstUserNickName { get; set; }
        [JsonProperty("fullAdress")] public string FullAdress { get; set; }
        [JsonProperty("fullName")] public string FullName { get; set; }
        [JsonProperty("glat")] public object Glat { get; set; }
        [JsonProperty("glng")] public object Glng { get; set; }
        [JsonProperty("groupFlag")] public object GroupFlag { get; set; }
        [JsonProperty("hasStaticMap")] public object HasStaticMap { get; set; }
        [JsonProperty("hits")] public int Hits { get; set; }
        [JsonProperty("isUserCanUpdate")] public object IsUserCanUpdate { get; set; }
        [JsonProperty("lastDate")] public string LastDate { get; set; }
        [JsonProperty("lastIp")] public object LastIp { get; set; }
        [JsonProperty("lastUser")] public object LastUser { get; set; }
        [JsonProperty("lastUserName")] public object LastUserName { get; set; }
        [JsonProperty("mainCategoryId")] public int MainCategoryId { get; set; }
        [JsonProperty("mainCategoryName")] public object MainCategoryName { get; set; }
        [JsonProperty("mainRegionId")] public int MainRegionId { get; set; }
        [JsonProperty("mainRegionName")] public string MainRegionName { get; set; }
        [JsonProperty("minUserMana")] public object MinUserMana { get; set; }
        [JsonProperty("monthlyHits")] public int MonthlyHits { get; set; }
        [JsonProperty("nearByTags")] public object NearByTags { get; set; }
        [JsonProperty("nearbyShops")] public object NearbyShops { get; set; }
        [JsonProperty("oldChainId")] public object OldChainId { get; set; }
        [JsonProperty("phoneNo")] public string PhoneNo { get; set; }
        [JsonProperty("phoneNo2")] public string PhoneNo2 { get; set; }
        [JsonProperty("picTotal")] public int PicTotal { get; set; }
        [JsonProperty("popularity")] public int Popularity { get; set; }
        [JsonProperty("power")] public int Power { get; set; }
        [JsonProperty("prevWeeklyHits")] public object PrevWeeklyHits { get; set; }
        [JsonProperty("priceInfo")] public object PriceInfo { get; set; }
        [JsonProperty("priceLevel")] public int PriceLevel { get; set; }
        [JsonProperty("primaryTag")] public string PrimaryTag { get; set; }
        [JsonProperty("promoId")] public int PromoId { get; set; }
        [JsonProperty("publicTransit")] public string PublicTransit { get; set; }
        [JsonProperty("refinedScore1")] public string RefinedScore1 { get; set; }
        [JsonProperty("refinedScore2")] public string RefinedScore2 { get; set; }
        [JsonProperty("refinedScore3")] public string RefinedScore3 { get; set; }
        [JsonProperty("regionId")] public int RegionId { get; set; }
        [JsonProperty("score")] public int Score { get; set; }
        [JsonProperty("score1")] public int Score1 { get; set; }
        [JsonProperty("score2")] public int Score2 { get; set; }
        [JsonProperty("score3")] public int Score3 { get; set; }
        [JsonProperty("score4")] public int Score4 { get; set; }
        [JsonProperty("searchKeyWord")] public object SearchKeyWord { get; set; }
        [JsonProperty("searchName")] public object SearchName { get; set; }
        [JsonProperty("shopGroupId")] public int ShopGroupId { get; set; }
        [JsonProperty("shopId")] public int ShopId { get; set; }
        [JsonProperty("shopName")] public string ShopName { get; set; }
        [JsonProperty("shopPower")] public int ShopPower { get; set; }
        [JsonProperty("shopPowerTitle")] public string ShopPowerTitle { get; set; }
        [JsonProperty("shopTagList")] public string[][] ShopTagList { get; set; }
        [JsonProperty("shopTags")] public string ShopTags { get; set; }
        [JsonProperty("shopTotalName")] public string ShopTotalName { get; set; }
        [JsonProperty("shopType")] public int ShopType { get; set; }
        [JsonProperty("similarShops")] public object SimilarShops { get; set; }
        [JsonProperty("suggestGA")] public object SuggestGA { get; set; }
        [JsonProperty("suggestReason")] public object SuggestReason { get; set; }
        [JsonProperty("todayHits")] public object TodayHits { get; set; }
        [JsonProperty("voteTotal")] public int VoteTotal { get; set; }
        [JsonProperty("webSite")] public object WebSite { get; set; }
        [JsonProperty("weeklyHits")] public int WeeklyHits { get; set; }
        [JsonProperty("wishTotal")] public object WishTotal { get; set; }
        [JsonProperty("writeUp")] public string WriteUp { get; set; }
    }

    /// <summary>
    /// Root object of the downloaded JSON: a handful of numeric fields plus the
    /// array of shops under the "shopBeans" key.
    /// </summary>
    internal class TopShopsBean
    {
        [JsonProperty("categoryId")] public int CategoryId { get; set; }
        [JsonProperty("cityId")] public int CityId { get; set; }
        [JsonProperty("maxResults")] public int MaxResults { get; set; }
        [JsonProperty("rankType")] public int RankType { get; set; }
        [JsonProperty("shopBeans")] public ShopBean[] ShopBeans { get; set; }
        [JsonProperty("shopType")] public int ShopType { get; set; }
        [JsonProperty("skipResults")] public int SkipResults { get; set; }
    }

CSharp抓取HTML网页内容

    using mshtml;
    using HtmlAgilityPack;

    /// <summary>
    /// Downloads a shop-list HTML page and appends one comma-separated line per list item
    /// to a result file, reading the markup with either mshtml or HtmlAgilityPack.
    /// </summary>
    class HTMLCrawler
    {
        /// <summary>
        /// Downloads <paramref name="uri"/>, optionally saves the raw page, and parses it
        /// with the engine selected by <paramref name="htmlEngin"/>.
        /// </summary>
        /// <param name="index">Number used to name the saved copy (index + ".html").</param>
        /// <param name="uri">Address of the page.</param>
        /// <param name="szResultPath">Text file that receives one line per list item (append mode).</param>
        /// <param name="szErrorPath">Text file that receives error messages (append mode).</param>
        /// <param name="htmlEngin">Which parser to use.</param>
        /// <returns>
        /// Counters for this page. bSuccess is true when the page was downloaded and its list
        /// was found; false when the download raised a WebException or the list was missing.
        /// </returns>
        private PhaseResultBean PhaseHtml(int index, Uri uri, String szResultPath, String szErrorPath, HTMLEnginType htmlEngin)
        {
            PhaseResultBean result = new PhaseResultBean();
            try
            {
                // WebClient is disposable; release it as soon as the body is in memory.
                string pageHtml;
                using (WebClient client = new WebClient())
                {
                    client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
                    Byte[] pageData = client.DownloadData(uri);
                    pageHtml = Encoding.UTF8.GetString(pageData);
                }

                if (checkSavePages.Checked)
                {
                    // NOTE(review): assumes getExeParentPath() ends with a path separator - confirm.
                    String szHtmlPath = XWin32.getExeParentPath() + index.ToString()+".html";
                    AppendLine(szHtmlPath, pageHtml);
                }

                // The field defaults to false, so success must be recorded explicitly.
                // A parser clears it again when it cannot find the shop list.
                result.bSuccess = true;

                switch(htmlEngin)
                {
                    case HTMLEnginType.HTMLEngin_mshtml:
                        PhaseHtml_mshtml(pageHtml, szResultPath, szErrorPath, result);
                        break;
                    case HTMLEnginType.HTMLEngin_HtmlAgilityPack:
                        PhaseHtml_HtmlAgilityPack(pageHtml, szResultPath, szErrorPath, result);
                        break;
                }
            }
            catch (WebException webEx)
            {
                AppendLine(szErrorPath, webEx.Message);
                result.bSuccess = false;
            }

            return result;
        }

        /// <summary>Appends <paramref name="szLine"/> plus a line terminator to the file at <paramref name="szPath"/>.</summary>
        private static void AppendLine(String szPath, String szLine)
        {
            using (StreamWriter sw = new StreamWriter(szPath, true))
            {
                sw.WriteLine(szLine);
            }
        }

        /// <summary>
        /// Parses the page with mshtml: walks the children of the first child of the element
        /// with id "shop-all-list" and appends one line per item to the result file.
        /// </summary>
        /// <param name="pageHtml">Page markup.</param>
        /// <param name="szResultPath">Result file (append mode).</param>
        /// <param name="szErrorPath">Error file (append mode).</param>
        /// <param name="result">Counters updated in place: total, successed, failed, bSuccess.</param>
        private void PhaseHtml_mshtml(String pageHtml, String szResultPath, String szErrorPath, PhaseResultBean result)
        {
            mshtml.HTMLDocument docObject = new mshtml.HTMLDocument();
            mshtml.IHTMLDocument2 doc2 = docObject as mshtml.IHTMLDocument2;
            doc2.write(pageHtml);
            doc2.close();

            mshtml.IHTMLDocument3 doc3 = docObject as mshtml.IHTMLDocument3;

            // Look the container up once and report a missing one instead of dereferencing null.
            IHTMLElement listRoot = doc3.getElementById("shop-all-list");
            if (listRoot == null)
            {
                AppendLine(szErrorPath, "Element with id 'shop-all-list' not found");
                result.bSuccess = false;
                return;
            }

            var items = listRoot.children[0].children;
            int len = items.length;
            result.total += len;

            foreach (IHTMLElement li in items)
            {
                try
                {
                    // Every field lives under the second child of the list item.
                    var info = li.children[1];

                    IHTMLElement title = info.children[0];
                    String szTitle = title.innerText;
                    if (szTitle != null) szTitle = szTitle.Replace("\r\n", "-");
                    IHTMLElement star = info.children[1].children[0];
                    String szStar = star.getAttribute("title");
                    IHTMLElement reviewNum = info.children[1].children[1];
                    String szReviewNum = reviewNum.innerText;
                    IHTMLElement meanPrice = info.children[1].children[3];
                    String szMeanPrice = meanPrice.innerText;
                    IHTMLElement category = info.children[2].children[0];
                    String szCategory = category.innerText;
                    IHTMLElement address = info.children[2].children[3];
                    String szAddress = address.innerText;
                    // Strings are immutable: Replace returns the new value, so assign it back.
                    // The comma is the column separator of the output line.
                    if (szAddress != null) szAddress = szAddress.Replace(",", "-");

                    IHTMLElement taste = info.children[3].children[0];
                    String szTaste = taste.innerText;
                    IHTMLElement evn = info.children[3].children[1];
                    String szEvn = evn.innerText;
                    IHTMLElement service = info.children[3].children[2];
                    String szService = service.innerText;

                    // Append the extracted fields to the result file
                    AppendLine(szResultPath, szTitle + "," + szStar + "," + szReviewNum + "," + szMeanPrice + "," + szCategory + "," + szAddress + "," + szTaste + "," + szEvn + "," + szService);

                    result.successed += 1;
                }
                catch (Exception Ex)
                {
                    // One malformed item must not stop the rest: log it and continue.
                    AppendLine(szErrorPath, Ex.Message);
                    result.failed += 1;
                }
            }
        }

        /// <summary>
        /// Parses the page with HtmlAgilityPack: selects the list items by a fixed XPath and
        /// appends one line per item to the result file.
        /// </summary>
        /// <param name="pageHtml">Page markup.</param>
        /// <param name="szResultPath">Result file (append mode).</param>
        /// <param name="szErrorPath">Error file (append mode).</param>
        /// <param name="result">Counters updated in place: total, successed, failed, bSuccess.</param>
        private void PhaseHtml_HtmlAgilityPack(String pageHtml, String szResultPath, String szErrorPath, PhaseResultBean result)
        {
            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(pageHtml);

            HtmlAgilityPack.HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("/html[1]/body[1]/div[4]/div[3]/div[1]/div[1]/div[2]/ul[1]/li");
            // SelectNodes yields null rather than an empty collection when nothing matches.
            if (nodes == null)
            {
                AppendLine(szErrorPath, "No shop list items matched the expected XPath");
                result.bSuccess = false;
                return;
            }
            result.total += nodes.Count;

            foreach (HtmlAgilityPack.HtmlNode li in nodes)
            {
                try
                {
                    // The first anchor is required (a miss throws and is logged below);
                    // the second anchor is optional.
                    HtmlAgilityPack.HtmlNode titleA = li.SelectNodes("div[2]/div[1]/a[1]")[0];
                    HtmlAgilityPack.HtmlNodeCollection titleBNodes = li.SelectNodes("div[2]/div[1]/a[2]");
                    HtmlAgilityPack.HtmlNode titleB = titleBNodes == null ? null : titleBNodes[0];
                    String szTitle = (titleA==null?"":titleA.InnerText) + "-" + (titleB == null ? "" : titleB.InnerText);
                    szTitle = szTitle.Replace("\n", "").Replace(" ", "");

                    HtmlAgilityPack.HtmlNode star = li.SelectNodes("div[2]/div[2]/span[1]")[0];
                    String szStar = star.Attributes["title"].Value.ToString();

                    HtmlAgilityPack.HtmlNode reviewNum = li.SelectNodes("div[2]/div[2]/a[1]")[0];
                    String szReviewNum = reviewNum.InnerText;
                    if (szReviewNum != null) szReviewNum = szReviewNum.Replace("\n", "").Replace(" ", "");

                    HtmlAgilityPack.HtmlNode meanPrice = li.SelectNodes("div[2]/div[2]/a[2]")[0];
                    String szMeanPrice = meanPrice.InnerText;
                    if (szMeanPrice != null) szMeanPrice = szMeanPrice.Replace("\n", "").Replace(" ", "");

                    HtmlAgilityPack.HtmlNode category = li.SelectNodes("div[2]/div[3]/a[1]")[0];
                    String szCategory = category.InnerText;

                    HtmlAgilityPack.HtmlNode addressA = li.SelectNodes("div[2]/div[3]/a[2]")[0];
                    HtmlAgilityPack.HtmlNode addressB = li.SelectNodes("div[2]/div[3]/span[1]")[0];
                    String szAddress = addressA.InnerText + "-" + addressB.InnerText;
                    // Strings are immutable: Replace returns the new value, so assign it back.
                    // The comma is the column separator of the output line.
                    szAddress = szAddress.Replace(",", "-");

                    HtmlAgilityPack.HtmlNode taste = li.SelectNodes("div[2]/span[1]/span[1]")[0];
                    String szTaste = taste.InnerText;
                    HtmlAgilityPack.HtmlNode evn = li.SelectNodes("div[2]/span[1]/span[2]")[0];
                    String szEvn = evn.InnerText;
                    HtmlAgilityPack.HtmlNode service = li.SelectNodes("div[2]/span[1]/span[3]")[0];
                    String szService = service.InnerText;

                    // Append the extracted fields to the result file
                    AppendLine(szResultPath, szTitle + "," + szStar + "," + szReviewNum + "," + szMeanPrice + "," + szCategory + "," + szAddress + "," + szTaste + "," + szEvn + "," + szService);

                    result.successed += 1;
                }
                catch (Exception Ex)
                {
                    // One malformed item must not stop the rest: log it and continue.
                    AppendLine(szErrorPath, Ex.Message);
                    result.failed += 1;
                }
            }
        }
    }

    /// <summary>
    /// Plain result holder returned by the crawler for one processed page.
    /// </summary>
    class PhaseResultBean
    {
        // Overall outcome flag for the page; the crawler clears it when the download fails.
        public bool bSuccess;

        // Per-page counters. The crawler adds the number of list items to total and
        // bumps failed for every item that raised an error.
        public int total, successed, failed;
    }

    /// <summary>
    /// Selects which HTML parser the crawler uses for a downloaded page.
    /// </summary>
    public enum HTMLEnginType
    {
        HTMLEngin_mshtml = 0,
        HTMLEngin_HtmlAgilityPack = 1
    }

Python抓取网页内容

1、BeautifulSoup解析网页

'''
Created on 20150203
@author: Hansen
'''

import urllib2
import sys
import io
from bs4 import BeautifulSoup

#Fetch HTML from URL
def fecth_html(index,url,keepHtml,resultFile):
    '''Download ``url`` and pass the response body to ``parse_html``.

    index      -- used to name the optional saved copy, ``<index>.html``
    url        -- address to request
    keepHtml   -- when true, also save the raw response body to ``<index>.html``
    resultFile -- forwarded unchanged to ``parse_html``
    '''
    req = urllib2.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    try:
        # The body is kept exactly as received; no decoding happens here.
        content = rsp.read()
    finally:
        # Release the connection even when reading fails.
        rsp.close()

    if keepHtml:
        # Binary mode stores the bytes untouched (no newline translation),
        # and the with-block guarantees the file is flushed and closed.
        with open(str(index)+'.html','wb') as fileinfo:
            fileinfo.write(content)
        print("save file "+ str(index)+'.html:    ok')

    parse_html(content,resultFile)
    
#Parse HTML
def parse_html(html,resultFile):
    '''Append one comma-separated line per shop list item found in ``html``.

    html       -- page markup, parsed with BeautifulSoup as UTF-8
    resultFile -- path of the UTF-16 text file that rows are appended to

    Items are the ``li`` elements under ``div.shop-all-list``. Each row holds:
    title, star title, review count, mean price, category, "addressA-addressB",
    and the three score texts. The number of items is printed first.
    Raises IndexError when an item lacks one of the selected elements.
    '''
    # from_encoding is the bs4 spelling of the keyword (fromEncoding is the old one).
    soup = BeautifulSoup(html,from_encoding="utf8")
    lis = soup.select('div.shop-all-list li')
    count = len(lis)
    print(count)
    if count == 0:
        # Nothing to append, so leave resultFile untouched.
        return

    # Open once for the whole batch. The UTF-16 codec emits a byte-order mark
    # whenever a writer starts on an empty file, so one unclosed writer per row
    # would scatter extra marks through the output while earlier rows are
    # still buffered. The with-block also guarantees the handle is closed.
    with io.open(resultFile,'a',encoding='utf_16') as fileinfo:
        for li in lis:
            szTitle = (li.select('div:nth-of-type(2) div:nth-of-type(1) a h4'))[0].get_text()
            szTitle = szTitle.replace("\r\n", "-").replace(" ","")
            szStar = (li.select('div:nth-of-type(2) div:nth-of-type(3) span'))[0]['title']
            szReviewNum = (li.select('div:nth-of-type(2) div:nth-of-type(3) a:nth-of-type(1)'))[0].get_text()
            szReviewNum = szReviewNum.replace("\n", "").replace(" ","")
            szMeanPrice = (li.select('div:nth-of-type(2) div:nth-of-type(3) a:nth-of-type(2)'))[0].get_text()
            szMeanPrice = szMeanPrice.replace("\n", "").replace(" ","")
            szCategory = (li.select('div:nth-of-type(2) div:nth-of-type(4) a:nth-of-type(1)'))[0].get_text()
            szAddressA = (li.select('div:nth-of-type(2) div:nth-of-type(4) a:nth-of-type(2)'))[0].get_text()
            szAddressB = (li.select('div:nth-of-type(2) div:nth-of-type(4) span:nth-of-type(3)'))[0].get_text()
            szAddress = (szAddressA+"-"+szAddressB).replace("\r\n", "-").replace(" ","")
            szTaste = (li.select('div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(1)'))[0].get_text()
            szEvn = (li.select('div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(2)'))[0].get_text()
            szService = (li.select('div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(3)'))[0].get_text()

            fileinfo.write(szTitle+","+szStar+","+szReviewNum+","+szMeanPrice+","+szCategory+"," +szAddress+","+szTaste+","+szEvn+","+szService+"\n")

2、PyQuery解析网页

'''
Created on 20150203
@author: Hansen
'''

import urllib2
import sys
import io
from pyquery import PyQuery

#Fetch HTML from URL
def fecth_html(index,url,keepHtml,resultFile):
    '''Download ``url`` and pass the response body to ``parse_html``.

    index      -- used to name the optional saved copy, ``<index>.html``
    url        -- address to request
    keepHtml   -- when true, also save the raw response body to ``<index>.html``
    resultFile -- forwarded unchanged to ``parse_html``
    '''
    req = urllib2.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    try:
        # The body is kept exactly as received; no decoding happens here.
        content = rsp.read()
    finally:
        # Release the connection even when reading fails.
        rsp.close()

    if keepHtml:
        # Binary mode stores the bytes untouched (no newline translation),
        # and the with-block guarantees the file is flushed and closed.
        with open(str(index)+'.html','wb') as fileinfo:
            fileinfo.write(content)
        print("save file "+ str(index)+'.html:    ok')

    parse_html(content,resultFile)
    
#Parse HTML
def parse_html(html,resultFile):
    '''Append one comma-separated line per shop list item found in ``html``.

    html       -- page markup, parsed with PyQuery
    resultFile -- path of the UTF-16 text file that rows are appended to

    Items are the ``li`` elements under ``div.shop-all-list``. Each row holds:
    title, star title, review count, mean price, category, "addressA-addressB",
    and the three score texts. The number of items is printed first.
    '''
    doc = PyQuery(html)
    lis = doc('div.shop-all-list li')
    count = len(lis)
    print(count)
    if count == 0:
        # Nothing to append, so leave resultFile untouched.
        return

    # Open once for the whole batch. The UTF-16 codec emits a byte-order mark
    # whenever a writer starts on an empty file, so one unclosed writer per row
    # would scatter extra marks through the output while earlier rows are
    # still buffered. The with-block also guarantees the handle is closed.
    with io.open(resultFile,'a',encoding='utf_16') as fileinfo:
        for li in lis:
            li_doc = PyQuery(li)
            szTitle = li_doc('li div div a h4').text()
            szTitle = szTitle.replace("\r\n", "-").replace(" ","")
            szStar = li_doc("li div div span").filter('.sml-rank-stars').attr('title')
            szReviewNum = li_doc('li div div a').filter('.review-num').text()
            szReviewNum = szReviewNum.replace("\n", "").replace(" ","")
            szMeanPrice = li_doc('li div div a').filter('.mean-price').text()
            szMeanPrice = szMeanPrice.replace("\n", "").replace(" ","")
            # The tag spans are read in order: the first one for the category,
            # the second one for the first half of the address. Reading index 1
            # for both would repeat the address value in the category column.
            # NOTE(review): order inferred from the two distinct columns - confirm against the page.
            tags = li_doc('li div div a span').filter('.tag')
            szCategory = tags.eq(0).text()
            szAddressA = tags.eq(1).text()
            szAddressB = li_doc('li div div span').filter('.addr').eq(0).text()
            szAddress = (szAddressA+"-"+szAddressB).replace("\r\n", "-").replace(" ","")
            scores = li_doc('li div span span')
            szTaste = scores.eq(0).text()
            szEvn = scores.eq(1).text()
            szService = scores.eq(2).text()

            fileinfo.write(szTitle+","+szStar+","+szReviewNum+","+szMeanPrice+","+szCategory+"," +szAddress+","+szTaste+","+szEvn+","+szService+"\n")