A Guide to the Basic Modules and Frameworks for Writing Crawlers in Python
Basic modules

The basic approach uses the urllib, urllib2, and re modules. Basic usage, by example:

(1) Make a basic GET request and fetch the page HTML
#!coding=utf-8
import urllib2

url = 'http://www.baidu.com/'
# Build the request object
request = urllib2.Request(url)
try:
    # Send the request and get the response
    response = urllib2.urlopen(request)
    # Read the response body
    html = response.read()
    # Read the response headers
    headers = response.info()
except urllib2.HTTPError, e:
    if hasattr(e, 'reason'):
        print e.reason
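If the GET request needs query parameters, they can be encoded with urllib.urlencode and appended to the URL. A minimal sketch; the 'wd' parameter name is Baidu's search keyword and is used here only for illustration:

#!coding=utf-8
import urllib
import urllib2

base_url = 'http://www.baidu.com/s'
# Encode the query string and append it to the URL
query = urllib.urlencode({'wd': 'python'})
request = urllib2.Request(base_url + '?' + query)
response = urllib2.urlopen(request)
html = response.read()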
(2) Make a basic POST request, submitting form data with custom headers

#!coding=utf-8
import urllib
import urllib2

post_url = ''  # fill in the target URL
# Encode the form fields into the request body
post_data = urllib.urlencode({
    'username': 'username',
    'password': 'password',
})
post_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0',
}
# Passing a data argument makes urllib2 issue a POST instead of a GET
request = urllib2.Request(
    url=post_url,
    data=post_data,
    headers=post_headers,
)
response = urllib2.urlopen(request)
html = response.read()
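urllib2 reports failures the same way for POST as for GET: it raises HTTPError when the server replies with an error status, and URLError when the server cannot be reached at all. A minimal sketch wrapping the request above (HTTPError must be caught first, since it is a subclass of URLError):

try:
    response = urllib2.urlopen(request)
    print response.getcode()  # HTTP status code, e.g. 200
    html = response.read()
except urllib2.HTTPError, e:
    # The server replied, but with an error status
    print e.code, e.reason
except urllib2.URLError, e:
    # The server could not be reached at all
    print e.reason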
(3) Fetch a Baidu Tieba thread page and extract the post bodies with a regular expression
#!coding=utf-8
import urllib2
import re
import sys

# Python 2 hack: force the default encoding to utf-8 so the file write below works
reload(sys)
sys.setdefaultencoding('utf-8')

page_num = 1
url = 'http://tieba.baidu.com/p/3238280985?see_lz=1&pn=' + str(page_num)
# The page is GBK-encoded, so decode it before matching
myPage = urllib2.urlopen(url).read().decode('gbk')
# Match each post body; re.DOTALL lets '.' span newlines
myRe = re.compile(r'class="d_post_content j_d_post_content ">(.*?)</div>', re.DOTALL)
items = myRe.findall(myPage)

f = open('baidu.txt', 'a+')
i = 0
for item in items:
    i += 1
    print i
    # Strip the markup and whitespace, then end each post with a newline
    text = item.replace('<br>', '')
    text = text.replace('\n', '').replace(' ', '') + '\n'
    print text
    f.write(text)
f.close()
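The reload(sys)/setdefaultencoding trick above is a well-known Python 2 hack; a cleaner alternative is to open the file through the codecs module, which encodes on write. A sketch under that assumption, with a stand-in for the items list extracted above:

#!coding=utf-8
import codecs

items = [u'example post<br>']  # stand-in for the regex results above
# codecs.open encodes unicode on write, so no setdefaultencoding hack is needed
f = codecs.open('baidu.txt', 'a+', 'utf-8')
for item in items:
    text = item.replace('<br>', '').replace('\n', '').replace(' ', '') + '\n'
    f.write(text)
f.close()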
(4) Simulate logging in to a 163 mailbox and download the messages
#coding:utf-8
'''
Simulate logging in to a 163 mailbox and download the message contents
'''
import urllib
import urllib2
import cookielib
import re
import time
import json


class Email163:
    header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    user = ''
    cookie = None
    sid = None
    mailBaseUrl = 'http://twebmail.mail.163.com'

    def __init__(self):
        # Install a global opener that keeps cookies across requests
        self.cookie = cookielib.CookieJar()
        cookiePro = urllib2.HTTPCookieProcessor(self.cookie)
        urllib2.install_opener(urllib2.build_opener(cookiePro))

    def login(self, user, pwd):
        '''
        Log in
        '''
        postdata = urllib.urlencode({
            'username': user,
            'password': pwd,
            'type': 1,
        })
        # Note: the login URL differs between webmail versions
        req = urllib2.Request(
            url='https://ssl.mail.163.com/entry/coremail/fcg/ntesdoor2?funcid=loginone&language=-1&passtype=1&iframe=1&product=mail163&from=web&df=email163&race=-2_45_-2_hz&module=&uid='+user+'&style=10&net=t&skinid=null',
            data=postdata,
            headers=self.header,
        )
        res = str(urllib2.urlopen(req).read())
        # Extract the session id from the response
        patt = re.compile('sid=([^"]+)', re.I)
        patt = patt.search(res)
        uname = user.split('@')[0]
        self.user = user
        if patt:
            self.sid = patt.group(1).strip()
            print '%s Login Successful.....' % (uname)
        else:
            print '%s Login failed....' % (uname)

    def getInBox(self):
        '''
        Fetch the inbox message list
        '''
        print '\nGet mail lists.....\n'
        sid = self.sid
        url = self.mailBaseUrl+'/jy3/list/list.do?sid='+sid+'&fid=1&fr=folder'
        res = urllib2.urlopen(url).read()
        # Parse the URL, sender, and subject of each message
        mailList = []
        patt = re.compile(r'<div\s+class="tdLike Ibx_Td_From"[^>]+>.*?href="([^"]+)"[^>]+>(.*?)</a>.*?<div\s+class="tdLike Ibx_Td_Subject"[^>]+>.*?href="[^>]+>(.*?)</a>', re.I | re.S)
        patt = patt.findall(res)
        if not patt:
            return mailList
        for i in patt:
            line = {
                'from': i[1].decode('utf8'),
                'url': self.mailBaseUrl + i[0],
                'subject': i[2].decode('utf8'),
            }
            mailList.append(line)
        return mailList

    def getMailMsg(self, url):
        '''
        Download a message body
        '''
        content = ''
        print '\n Download.....%s\n' % (url)
        res = urllib2.urlopen(url).read()
        patt = re.compile('contentURL:"([^"]+)"', re.I)
        patt = patt.search(res)
        if patt == None:
            return content
        url = '%s%s' % (self.mailBaseUrl, patt.group(1))
        time.sleep(1)
        res = urllib2.urlopen(url).read()
        Djson = json.JSONDecoder(encoding='utf8')
        jsonRes = Djson.decode(res)
        if 'resultVar' in jsonRes:
            content = jsonRes['resultVar']
        time.sleep(3)
        return content


'''
Demo
'''
# Initialize
mail163 = Email163()
# Log in
mail163.login('lpe234@163.com', '944898186')
time.sleep(2)
# Fetch the inbox
elist = mail163.getInBox()
# Fetch each message body
for i in elist:
    print 'Subject: %s From: %s Content:\n%s' % (i['subject'].encode('utf8'), i['from'].encode('utf8'), mail163.getMailMsg(i['url']).encode('utf8'))
(5) Cases that require logging in

#1 Handling cookies
import urllib2, cookielib

cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
content = urllib2.urlopen('http://XXXX').read()

#2 Using a proxy together with cookies
proxy_support = urllib2.ProxyHandler({'http': 'http://XX.XX.XX.XX:XXXX'})  # proxy address placeholder
opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler)

#3 Handling forms
import urllib
postdata = urllib.urlencode({
    'username': 'XXXXX',
    'password': 'XXXXX',
    'continueURI': 'http://www.verycd.com/',
    'fk': fk,  # 'fk' is a dynamic token that must first be scraped from the login page
    'login_submit': '登录',
})
req = urllib2.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata,
)
result = urllib2.urlopen(req).read()

#4 Masquerading as a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
req = urllib2.Request(
    url='http://secure.verycd.com/signin/*/http://www.verycd.com/',
    data=postdata,
    headers=headers,
)

#5 Defeating anti-hotlinking checks: some sites verify the Referer header, so send one pointing at the same site
headers = {
    'Referer': 'http://www.cnbeta.com/articles'
}
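These handlers can be combined into a single opener; once installed, every subsequent urllib2.urlopen call goes through the proxy, carries the cookie jar, and sends the default headers. A minimal sketch; the proxy address is a placeholder:

import urllib2, cookielib

cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
proxy_support = urllib2.ProxyHandler({'http': 'http://127.0.0.1:8087'})  # placeholder
opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler)
# Headers set here are sent with every request made through this opener
opener.addheaders = [('User-Agent', 'Mozilla/5.0'),
                     ('Referer', 'http://www.cnbeta.com/articles')]
urllib2.install_opener(opener)
content = urllib2.urlopen('http://XXXX').read()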
(6) Multithreading with a task queue
from threading import Thread
from Queue import Queue
from time import sleep

# q is the task queue
# NUM is the number of concurrent worker threads
# JOBS is the number of tasks
q = Queue()
NUM = 2
JOBS = 10

# The handler function, responsible for processing a single task
def do_something_using(arguments):
    print arguments

# The worker loop: keep pulling tasks off the queue and processing them
def working():
    while True:
        arguments = q.get()
        do_something_using(arguments)
        sleep(1)
        q.task_done()

# Spawn NUM daemon threads waiting on the queue
for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

# Enqueue the JOBS tasks
for i in range(JOBS):
    q.put(i)

# Block until all tasks are done
q.join()
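To turn this skeleton into a concurrent downloader, the worker only needs to fetch whatever URL it pulls off the queue. A sketch under that assumption; the URL list is made up for illustration:

import urllib2
from threading import Thread
from Queue import Queue

q = Queue()
NUM = 2

def working():
    while True:
        url = q.get()
        try:
            # Each worker thread downloads one page at a time
            html = urllib2.urlopen(url).read()
            print url, len(html)
        except urllib2.URLError, e:
            print url, e.reason
        q.task_done()

for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

for url in ['http://www.baidu.com/', 'http://www.163.com/']:
    q.put(url)
q.join()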
The Scrapy framework

I have only just started learning this framework, so I will hold off on a full verdict. My first impression is that it feels a bit like Java: it needs quite a few supporting modules before it will run.

(I) Creating a scrapy project

# Create a scrapy project with:
scrapy startproject scrapy_test
├── scrapy_test
│   ├── scrapy.cfg
│   └── scrapy_test
│       ├── __init__.py
│       ├── items.py
│       ├── pipelines.py
│       ├── settings.py
│       └── spiders
│           ├── __init__.py

(II) Notes

scrapy.cfg is the project configuration file. The dependencies are somewhat tedious to install:

# Install the python-dev package
apt-get install python-dev
# twisted, w3lib, six, queuelib, cssselect, libxslt
pip install w3lib
pip install twisted
pip install lxml
apt-get install libxml2-dev libxslt-dev
apt-get install python-lxml
pip install cssselect
pip install pyOpenSSL
sudo pip install service_identity
# Once everything is installed, create a project with: scrapy startproject test

(IV) A crawling example

dizzy@dizzy-pc:~/Python/spit$ scrapy startproject itzhaopin
New Scrapy project 'itzhaopin' created in:
    /home/dizzy/Python/spit/itzhaopin

You can start your first spider with:
    cd itzhaopin
    scrapy genspider example example.com
dizzy@dizzy-pc:~/Python/spit$
dizzy@dizzy-pc:~/Python/spit$ cd itzhaopin
dizzy@dizzy-pc:~/Python/spit/itzhaopin$ tree
.
├── itzhaopin
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       └── __init__.py
└── scrapy.cfg

# scrapy.cfg: project configuration file
# items.py: defines the data structures to be extracted
# pipelines.py: pipeline definitions, for post-processing the extracted items (e.g. saving them)
# settings.py: crawler configuration file
# spiders: the directory that holds the spiders
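The commands above only create the project skeleton; the actual crawling logic lives in a spider placed under the spiders directory. A minimal sketch of what such a spider might look like, assuming a reasonably recent Scrapy version; the spider name, start URL, and CSS selector are illustrative assumptions, not part of the generated project:

# itzhaopin/spiders/example_spider.py -- a hypothetical minimal spider
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'                      # run with: scrapy crawl example
    start_urls = ['http://example.com/']  # placeholder start page

    def parse(self, response):
        # Extract the text of every link on the page; the selector is illustrative
        for title in response.css('a::text').extract():
            yield {'title': title}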
