简单的爬虫:从 HTML 中提取表格信息
发布时间:2020-05-25 00:49:44 所属栏目:Python 来源:互联网
导读:简单的爬虫,从 HTML 中提取表格信息。

下面是脚本之家 jb51.cc 通过网络收集整理的代码片段,脚本之家小编现在分享给大家,也给大家做个参考。

#!/usr/bin/env python
#coding=utf8
# BUG FIX: import sys first and unconditionally. The ImportError handler
# below uses sys.stderr / sys.exc_info / sys.version; in the original, sys
# was imported inside the try block, so a failure in an earlier import
# (e.g. pycurl missing) raised a NameError instead of the friendly message.
import sys

try:
    import os
    import urllib
    import pycurl
    try:
        # cStringIO is a faster C implementation; fall back to the pure
        # Python StringIO when it is unavailable.
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    from pyquery import PyQuery as pyq
    # Python 2 hack: re-expose setdefaultencoding so implicit str/unicode
    # conversions use UTF-8 instead of ASCII.
    reload(sys)
    sys.setdefaultencoding('utf-8')
except ImportError:
    sys.stderr.write("""
There was a problem importing one of the Python modules required.
The error leading to this problem was:
%s
Please install a package which provides this module,or
verify that the module is installed correctly.
It's possible that the above module doesn't match the current version of Python,which is:
%s
""" % (sys.exc_info(), sys.version) + "\n")
    sys.exit(1)
# Script metadata: program name, the site the snippet was published on,
# and the script version.
__prog__ = "crawl"
__site__ = "http://www.oschina.net/code"
__version__ = "1.0"
class HttpRequest(object):
    """Thin wrapper around a single pycurl.Curl handle.

    Supports GET/POST requests with redirect following, gzip/deflate
    decoding, and cookie persistence in ``cookie.dat``. After perform(),
    the response body, headers, status code, effective URL and cookies
    are cached on the instance and exposed via the get_* accessors.
    """
    curl = None

    def __init__(self):
        self.url = None
        self.url_para = None
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.VERBOSE, 0)
        self.curl.setopt(pycurl.USERAGENT, 'Miozilla/4.0 (compatible; MSIE 8.0; WindowsNT 6.1)')
        self.curl.setopt(pycurl.HEADER, 1)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.MAXREDIRS, 5)
        # Persist cookies across requests (read and write the same file).
        self.curl.setopt(pycurl.COOKIEFILE, 'cookie.dat')
        self.curl.setopt(pycurl.COOKIEJAR, 'cookie.dat')
        self.curl.setopt(pycurl.HTTPGET, 1)
        self.curl.setopt(pycurl.ENCODING, 'gzip,deflate')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 60)
        self.curl.setopt(pycurl.TIMEOUT, 300)

    def set_url_para(self, para):
        """Append *para* to self.url and use the result as the request URL.

        NOTE(review): self.url starts as None and is never assigned by this
        class, so callers must set it first or this raises TypeError.
        """
        self.url_para = para
        url = self.url + para
        self.curl.setopt(pycurl.URL, url)

    def set_post_para(self, para):
        """Switch the handle to POST with *para* url-encoded as the body."""
        self.curl.setopt(pycurl.POST, 1)
        self.curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(para))

    def set_cookie(self, cookie):
        """Send an explicit Cookie header value with subsequent requests."""
        self.curl.setopt(pycurl.COOKIE, cookie)

    def dry_write(self, buf):
        # Sink callback: swallow data (used to keep headers out of files).
        pass

    def download(self, url, file_path):
        """Download *url* directly into *file_path*, creating parent dirs."""
        dir_name = os.path.dirname(file_path)
        # BUG FIX: guard against an empty dirname (bare filename) —
        # os.makedirs('') raises OSError.
        if dir_name and not os.path.exists(dir_name):
            os.makedirs(dir_name)
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.HEADER, False)
        # Discard header data, otherwise it would be written into the file.
        self.curl.setopt(pycurl.HEADERFUNCTION, self.dry_write)
        with open(file_path, 'wb') as outfile:
            self.curl.setopt(pycurl.WRITEFUNCTION, outfile.write)
            try:
                self.curl.perform()
            except Exception as e:
                # BUG FIX: the original closed the curl handle here, which
                # silently broke every later request on this object. Report
                # the failure and keep the handle usable.
                sys.stderr.write('download failed: %s\n' % e)

    def perform(self, url, referer=''):
        """Fetch *url* with a GET and cache the response on the instance.

        BUG FIX: the original signature was perform(self, referer='') and
        read ``url`` from an accidental module-level global; the caller
        passes the URL as the first positional argument (landing it in
        *referer*), so ``url`` is now a real parameter.
        """
        assert url, 'url is null!'
        self.curl.setopt(pycurl.URL, url)
        if referer:
            self.curl.setopt(pycurl.REFERER, referer)
        self.buf = StringIO()
        self.head = StringIO()
        self.curl.setopt(pycurl.WRITEFUNCTION, self.buf.write)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.head.write)
        # BUG FIX: on failure the original closed the curl handle and
        # swallowed the error, breaking all subsequent requests while
        # leaving stale response data in place. Let the exception
        # propagate to the caller and only release the buffers.
        try:
            self.curl.perform()
            self.r = self.buf.getvalue()
            self.h = self.head.getvalue()
            self.code = self.curl.getinfo(pycurl.HTTP_CODE)
            self.info = self.curl.getinfo(pycurl.EFFECTIVE_URL)
            self.cookie = self.curl.getinfo(pycurl.INFO_COOKIELIST)
            # Use the final effective URL as the next request's referer.
            self.curl.setopt(pycurl.REFERER, self.info)
        finally:
            self.buf.close()
            self.head.close()

    def __del__(self):
        # Guard the close: the handle may already be closed or never built
        # (e.g. if __init__ failed part-way through).
        try:
            self.curl.close()
        except Exception:
            pass

    def get_body(self):
        return self.r

    def get_head(self):
        return self.h

    def get_code(self):
        return self.code

    def get_info(self):
        return self.info

    def get_cookie(self):
        return self.cookie
if __name__ == '__main__':
    # Crawl pages s1.asp .. s9.asp of the target site, 9 result pages each,
    # and print the text of each table found under the '.contd' container.
    asp_range = xrange(1, 10)
    page_range = xrange(1, 10)
    crawl = HttpRequest()
    for i in asp_range:
        for j in page_range:
            url = 'http://www.nbbicycle.com/html/116/s%d.asp?i=1&page=%d' % (i, j)
            try:
                crawl.perform(url)
                doc = pyq(crawl.get_body())
                content = doc('.contd')
                # Section heading, then the text of every table row.
                print(content.children('div').eq(0).text())
                for tr in content.items('tr'):
                    print(tr.text())
            except Exception as e:
                # Best-effort crawl: report the failure and move on to the
                # next page instead of aborting the whole run.
                print(e)
以上是脚本之家(jb51.cc)为你收集整理的全部代码内容,希望文章能够帮你解决所遇到的程序开发问题。如果觉得脚本之家网站内容还不错,欢迎将脚本之家网站推荐给程序员好友。(编辑:安卓应用网)
【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!
