`
xidajiancun
  • 浏览: 452585 次
文章分类
社区版块
存档分类
最新评论

[Python] (多线程版本)抓取聚划算页面商品分析页面获取商品信息并以XML格式保存到本地

 
阅读更多
#!/usr/bin/python
# -*- coding: gbk -*-
#Spider.py

import urllib2
import httplib
import StringIO
import gzip
import re
import chardet
import sys
import os
import datetime
import Queue
import threading
from xml.dom.minidom import Document
from BeautifulSoup import BeautifulSoup

## Workaround: raise the interpreter's default encoding so printing Chinese
## text to the console does not raise UnicodeEncodeError (Python 2 only).
reload(sys)
sys.setdefaultencoding("utf8")
#####################################################

## Debug switch: when on, HTTP request header info and debug logs are shown
DEBUG = 1
NO_DEBUG = 0
httplib.HTTPConnection.debuglevel = DEBUG
## Whether to print the crawled page source to the console
showSrcCode = False
## Compression scheme requested from / expected of the server
ZIP_TYPE = "gzip"

# default output base name and target directory for the saved XML files
fileName = "auctions"
location = "d://spiderData/"

## header sent with every request (spoofed UA; asks for gzip-compressed body)
headerConfig = {"User-Agent":"taobao-yanyuan.qzs", "Accept-encoding":ZIP_TYPE}
#####################################################

# hand-off queues between the three pipeline stages (download -> parse -> save)
spiderQueue = Queue.Queue()  # SpiderConfig items waiting to be downloaded
parseQueue = Queue.Queue()  # {"data": html, "config": SpiderConfig} to parse
saveQueue = Queue.Queue()  # {"auctionList": [...], "fileName": name} to write

# city name -> Juhuasuan city page URL to crawl
needSpiderUrl = {"suzhou":"http://ju.taobao.com/suzhou",
                     "hangzhou":"http://ju.taobao.com/hangzhou",
                     "shanghai":"http://ju.taobao.com/shanghai",
                     "beijing":"http://ju.taobao.com/beijing",
                     "chengdu":"http://ju.taobao.com/chengdu"}

#############class SpiderConfig #####################
class SpiderConfig:
    """One crawl target: a short name (also the output file name) plus the
    URL of the page to fetch."""

    def __init__(self, name, url):
        # name is reused by SaveHandler as the XML file's base name
        self.name = name
        self.url = url
#####################################################

##############class SpiderAuctionDomain##############
class SpiderAuctionDomain:
    """
        Store information about one auction item scraped from the page.
        All fields default to "" so SaveHandler can serialize any instance.
    """
    title = ""
    url = ""
    img = ""
    price = ""
    # BUG FIX: ParseHandler.parse() assigns .link and SaveHandler reads
    # auc.link, but no default existed; when the link regex did not match,
    # reading auc.link raised AttributeError. Give it a default like the
    # other fields.
    link = ""

    def __init__(self):
        pass

#####################################################

########class SpiderDefaultErrorHandler##############
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
    # Converts HTTP error statuses into ordinary return values instead of
    # raising, so one failing URL cannot abort a spider worker thread.
    def http_error_default(self, req, fp, code, msg, hdrs):
        """
            default error process handler for spider
        """
        # Build the HTTPError object but RETURN it as the response rather
        # than raising it; callers can inspect .status and .url on it.
        result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp)
        result.status = code
        result.url = req.get_full_url()

        print "<", result.url, "Exception code :", result.status, ">"

        return result
#####################################################

#############class SpiderHandler#####################
class SpiderHandler(threading.Thread):
    """
        spider handler
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        while True:
            spiderConfig = spiderQueue.get()
            spiderData = SpiderHandler.spider(self, spiderConfig)
            if spiderData is not None and len(spiderData) > 0:
                parseQueue.put({"data":spiderData,"config":spiderConfig})
            spiderQueue.task_done()

    def spider(self, spiderConfig):
        try:
            request = urllib2.Request(spiderConfig.url)

            ## configure request hreader
            for key,val in headerConfig.items():
                request.add_header(key, val)

            ## build opener
            opener = urllib2.build_opener(SpiderDefaultErrorHandler())

            ## open request
            openRequest = opener.open(request)

            ## read data
            spiderData = openRequest.read()

            ## close
            opener.close()

            if 0 == len(spiderData):
                return

            if ZIP_TYPE== openRequest.headers.get("Content-Encoding"):
                spiderData = SpiderHandler.gzipData(self, spiderData)

            if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode:
                print spiderData

            return spiderData
        except Exception,x:
            print "spider process Exception:", x

    def gzipData(self, spiderData):
        """
            get data from gzip
        """
        if 0 == len(spiderData):
            return spiderData
        spiderDataStream = StringIO.StringIO(spiderData)
        spiderData = gzip.GzipFile(fileobj=spiderDataStream).read()
        return spiderData
#####################################################

class ParseHandler(threading.Thread):
    """
        parse html
    """
    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        while True:
            t = parseQueue.get()
            spiderData = t["data"]
            spiderConfig = t["config"]
            auctionList = ParseHandler.parse(self, spiderData)
            saveQueue.put({"auctionList":auctionList, "fileName":spiderConfig.name})
            parseQueue.task_done()

    def parse(self, spiderData):
        """
            parse html content
        """

        if httplib.HTTPConnection.debuglevel == DEBUG:
            charsetAnalyze = chardet.detect(spiderData)
            print "analyze spider data encode :",charsetAnalyze["encoding"]

        print "执行解析", fileName

        soup = BeautifulSoup(spiderData)
        encode = soup.originalEncoding

        encoding = lambda x : x.encode(encode)

        if httplib.HTTPConnection.debuglevel == DEBUG:
            print "识别到编码:", encode
            title = soup.head.title.string
            print encoding(title)

        spiderContents = soup.findAll(name="div", attrs={"class":"main-box  avil"})
        auctions = ["%s" % s for s in spiderContents]

        if auctions is None:
            return

        auctionList = []

        for auc in auctions:
            auctionDomain = SpiderAuctionDomain()
            # parse auction link
            links = re.search(re.compile(r'<a href=[\"|\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"|\']', re.IGNORECASE), auc)
            if links is not None :
                auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0]))

            #parse auction title
            titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc)
            if titles is not None:
                auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0]))

            #parse auction price
            price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc)
            if price is not None:
                auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0])

            #parse image url
            imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc)
            if imgs is not None:
                auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0])

            auctionList.append(auctionDomain)

        print "成功解析商品信息:"
        for a in auctionList:
            print "--->",a.title

        # sort auction list
        auctionList = ParseHandler.sortAuctionList(self, auctionList)

        return auctionList

        print "解析完成"

    def sortAuctionList(self, auctionList):
        """
            冒泡排序,按照价格排序
        """
        length = len(auctionList)
        if length < 2:
            return auctionList
        else:
            for i in range(length-1):
                for j in range(length - i -1):
                    if float(auctionList[j].price) > float(auctionList[j+1].price):
                        auctionList[j], auctionList[j+1] = auctionList[j+1], auctionList[j]
        return auctionList
        pass

#####################################################

class SaveHandler(threading.Thread):
    """
        save result
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        while True:
            au = saveQueue.get()
            SaveHandler.save(self, au["auctionList"], au["fileName"])
            saveQueue.task_done()

    def save(self, auctionList, fileName):
        if auctionList is not None:
            doc = Document()

            auctions = doc.createElement("auctions")
            doc.appendChild(auctions)

            for auc in auctionList:
                auction = doc.createElement("auction")
                auctions.appendChild(auction)

                SaveHandler.generateXML(self, doc, auction, "title", auc.title)
                SaveHandler.generateXML(self, doc, auction, "price", auc.price)
                SaveHandler.generateXML(self, doc, auction, "img", auc.img)
                SaveHandler.generateXML(self, doc, auction, "link", auc.link)

            if False == os.path.exists(location):
                os.mkdir(location)

            file = open(location+fileName+".xml", 'w')
            file.write(doc.toprettyxml())
            file.close()

            if httplib.HTTPConnection.debuglevel == DEBUG:
                print doc.toprettyxml()

    def generateXML(self, doc, f, name, txt):
        c = doc.createElement(name)
        f.appendChild(c)
        c.appendChild(doc.createTextNode(txt))
#####################################################


if __name__ == "__main__":
    nowtime = lambda:datetime.datetime.strftime(datetime.datetime.now(),"%Y年%m月%d日 %H时%m分%S秒")

    for i in range(5):
        spider = SpiderHandler()
        spider.setDaemon(True)
        spider.start()

        parse = ParseHandler()
        parse.setDaemon(True)
        parse.start()

        save = SaveHandler()
        save.setDaemon(True)
        save.start()

    for k,v in needSpiderUrl.items():
        spiderConfig = SpiderConfig(k, v)
        spiderQueue.put(spiderConfig)

    print "爬虫执行开始时间:",nowtime()

    spiderQueue.join()
    parseQueue.join()
    saveQueue.join()

    print "爬虫执行完毕时间:",nowtime()


分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics