#!/usr/bin/python
# -*- coding: gbk -*-
#Spider.py
import urllib2
import httplib
import StringIO
import gzip
import re
import chardet
import sys
import os
import datetime
import Queue
import threading
from xml.dom.minidom import Document
from BeautifulSoup import BeautifulSoup
## 这段代码是用于解决控制台打印汉字报错的问题
reload(sys)
sys.setdefaultencoding("utf8")
#####################################################
## debug模式开关,开启后可以看到Http请求的头部信息以及debug日志
DEBUG = 1
NO_DEBUG = 0
httplib.HTTPConnection.debuglevel = DEBUG
## 是否显示爬取网页源代码开关
showSrcCode = False
## 压缩方式
ZIP_TYPE = "gzip"
fileName = "auctions"
location = "d://spiderData/"
## header
headerConfig = {"User-Agent":"taobao-yanyuan.qzs", "Accept-encoding":ZIP_TYPE}
#####################################################
spiderQueue = Queue.Queue()
parseQueue = Queue.Queue()
saveQueue = Queue.Queue()
needSpiderUrl = {"suzhou":"http://ju.taobao.com/suzhou",
"hangzhou":"http://ju.taobao.com/hangzhou",
"shanghai":"http://ju.taobao.com/shanghai",
"beijing":"http://ju.taobao.com/beijing",
"chengdu":"http://ju.taobao.com/chengdu"}
#############class SpiderConfig #####################
class SpiderConfig:
"""
configuration for spider name and url
"""
def __init__(self, name, url):
self.name = name
self.url = url
#####################################################
##############class SpiderAuctionDomain##############
class SpiderAuctionDomain:
"""
Store information with auctions spidered by python
"""
title = ""
url = ""
img = ""
price = ""
def __init__(self):
pass
#####################################################
########class SpiderDefaultErrorHandler##############
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
def http_error_default(self, req, fp, code, msg, hdrs):
"""
default error process handler for spider
"""
result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp)
result.status = code
result.url = req.get_full_url()
print "<", result.url, "Exception code :", result.status, ">"
return result
#####################################################
#############class SpiderHandler#####################
class SpiderHandler(threading.Thread):
"""
spider handler
"""
def __init__(self):
threading.Thread.__init__(self)
def run(self):
while True:
spiderConfig = spiderQueue.get()
spiderData = SpiderHandler.spider(self, spiderConfig)
if spiderData is not None and len(spiderData) > 0:
parseQueue.put({"data":spiderData,"config":spiderConfig})
spiderQueue.task_done()
def spider(self, spiderConfig):
try:
request = urllib2.Request(spiderConfig.url)
## configure request hreader
for key,val in headerConfig.items():
request.add_header(key, val)
## build opener
opener = urllib2.build_opener(SpiderDefaultErrorHandler())
## open request
openRequest = opener.open(request)
## read data
spiderData = openRequest.read()
## close
opener.close()
if 0 == len(spiderData):
return
if ZIP_TYPE== openRequest.headers.get("Content-Encoding"):
spiderData = SpiderHandler.gzipData(self, spiderData)
if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode:
print spiderData
return spiderData
except Exception,x:
print "spider process Exception:", x
def gzipData(self, spiderData):
"""
get data from gzip
"""
if 0 == len(spiderData):
return spiderData
spiderDataStream = StringIO.StringIO(spiderData)
spiderData = gzip.GzipFile(fileobj=spiderDataStream).read()
return spiderData
#####################################################
class ParseHandler(threading.Thread):
"""
parse html
"""
def __init__(self):
threading.Thread.__init__(self)
def run(self):
while True:
t = parseQueue.get()
spiderData = t["data"]
spiderConfig = t["config"]
auctionList = ParseHandler.parse(self, spiderData)
saveQueue.put({"auctionList":auctionList, "fileName":spiderConfig.name})
parseQueue.task_done()
def parse(self, spiderData):
"""
parse html content
"""
if httplib.HTTPConnection.debuglevel == DEBUG:
charsetAnalyze = chardet.detect(spiderData)
print "analyze spider data encode :",charsetAnalyze["encoding"]
print "执行解析", fileName
soup = BeautifulSoup(spiderData)
encode = soup.originalEncoding
encoding = lambda x : x.encode(encode)
if httplib.HTTPConnection.debuglevel == DEBUG:
print "识别到编码:", encode
title = soup.head.title.string
print encoding(title)
spiderContents = soup.findAll(name="div", attrs={"class":"main-box avil"})
auctions = ["%s" % s for s in spiderContents]
if auctions is None:
return
auctionList = []
for auc in auctions:
auctionDomain = SpiderAuctionDomain()
# parse auction link
links = re.search(re.compile(r'<a href=[\"|\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"|\']', re.IGNORECASE), auc)
if links is not None :
auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0]))
#parse auction title
titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc)
if titles is not None:
auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0]))
#parse auction price
price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc)
if price is not None:
auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0])
#parse image url
imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc)
if imgs is not None:
auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0])
auctionList.append(auctionDomain)
print "成功解析商品信息:"
for a in auctionList:
print "--->",a.title
# sort auction list
auctionList = ParseHandler.sortAuctionList(self, auctionList)
return auctionList
print "解析完成"
def sortAuctionList(self, auctionList):
"""
冒泡排序,按照价格排序
"""
length = len(auctionList)
if length < 2:
return auctionList
else:
for i in range(length-1):
for j in range(length - i -1):
if float(auctionList[j].price) > float(auctionList[j+1].price):
auctionList[j], auctionList[j+1] = auctionList[j+1], auctionList[j]
return auctionList
pass
#####################################################
class SaveHandler(threading.Thread):
"""
save result
"""
def __init__(self):
threading.Thread.__init__(self)
def run(self):
while True:
au = saveQueue.get()
SaveHandler.save(self, au["auctionList"], au["fileName"])
saveQueue.task_done()
def save(self, auctionList, fileName):
if auctionList is not None:
doc = Document()
auctions = doc.createElement("auctions")
doc.appendChild(auctions)
for auc in auctionList:
auction = doc.createElement("auction")
auctions.appendChild(auction)
SaveHandler.generateXML(self, doc, auction, "title", auc.title)
SaveHandler.generateXML(self, doc, auction, "price", auc.price)
SaveHandler.generateXML(self, doc, auction, "img", auc.img)
SaveHandler.generateXML(self, doc, auction, "link", auc.link)
if False == os.path.exists(location):
os.mkdir(location)
file = open(location+fileName+".xml", 'w')
file.write(doc.toprettyxml())
file.close()
if httplib.HTTPConnection.debuglevel == DEBUG:
print doc.toprettyxml()
def generateXML(self, doc, f, name, txt):
c = doc.createElement(name)
f.appendChild(c)
c.appendChild(doc.createTextNode(txt))
#####################################################
if __name__ == "__main__":
nowtime = lambda:datetime.datetime.strftime(datetime.datetime.now(),"%Y年%m月%d日 %H时%m分%S秒")
for i in range(5):
spider = SpiderHandler()
spider.setDaemon(True)
spider.start()
parse = ParseHandler()
parse.setDaemon(True)
parse.start()
save = SaveHandler()
save.setDaemon(True)
save.start()
for k,v in needSpiderUrl.items():
spiderConfig = SpiderConfig(k, v)
spiderQueue.put(spiderConfig)
print "爬虫执行开始时间:",nowtime()
spiderQueue.join()
parseQueue.join()
saveQueue.join()
print "爬虫执行完毕时间:",nowtime()
分享到:
相关推荐
主要为大家详细介绍了Python抓取聚划算商品分析页面获取商品信息,并以XML格式保存到本地的方法,具有一定的参考价值,感兴趣的小伙伴们可以参考一下
一个Python多线程爬虫,在工作时,开10个线程来抓取新浪网页的数据,抓取并保存页面, 并且根据deep返回页面链接,根据key确定是否保存该页面,其中: deep == 0时,是抓取的最后一层深度,即只抓取并保存页面,不...
自己写的一个多线程爬虫,在当前目录下创建目录保存相应图片,共100行代码,可以正常运行
主要介绍了python抓取并保存html页面时乱码问题的解决方法,结合实例形式分析了Python页面抓取过程中乱码出现的原因与相应的解决方法,需要的朋友可以参考下
主要介绍了Python实现抓取HTML网页并以PDF文件形式保存的方法,结合实例形式分析了PyPDF2模块的安装及Python抓取HTML页面并基于PyPDF2模块生成pdf文件的相关操作技巧,需要的朋友可以参考下
基于Linux的python多线程爬虫程序设计.pdf
根据商品id指定需要抓取的商品范围,抓取指定商品详情页价格、库存、运费信息,存入数据库作数据分析和参考 使用步骤: 1.搭建python环境,配置好环境变量 2.配置数据库环境,根据本地数据库连接修改alibaba.py中的...
Python多线程爬虫 功能描述 使用python编写一个网站爬虫程序,支持参数如下: spider.py -u url -d deep -f logfile -l loglevel(1-5) --testself -thread number --dbfile filepath --key=”HTML5” 参数说明: -...
python多线程编程实现网络串口透传, 为TCP客户端网口数据串口透传。
python多线程技术爬取天天基金排行榜所有基金数据,结果并保存到excel并写入mysql数据库。基金股票量化分析利器,分分钟获取股票基金数据。
Python多线程超大日志文件解析转储,实现几十G超大文件并发处理。 实现功能如下: 1.多线程分块解析某超大日志文件,实现超大文件多线程分块处理 2.多线程写入数据到数据库postgresql/mysql 3.线程之间通过队列queue...
Python多线程编程,简要描述了Python中多线程的实现过程
主要介绍了Python基于多线程实现抓取数据存入数据库的方法,结合实例形式分析了Python使用数据库类与多线程类进行数据抓取与写入数据库操作的具体使用技巧,需要的朋友可以参考下
python多线程,断点续传下载程序,功能比较简单,可以进行二次开发。实现更好用的 功能。
此资源是一个基于Python的爬虫脚本,利用urllib库抓取指定贴吧的指定页数据,并将抓取到的内容保存到本地文件中。该脚本可以帮助用户快速获取贴吧中的帖子标题、内容、发布时间等信息,并可以用于数据分析、内容提取...
能根据url的个数快速开启对个线程,单个线程可以实现对同一个url的多次访问,返回访问成功或者失败的结果
实现抓取新闻页面所有新闻链接对应页面到本地
学习用的,python写的多线程抓取代理服务器,保存,验证程序-烤火C知识网