爬取豆瓣照片
发布时间:2020-05-25 16:34:00 所属栏目:Python 来源:互联网
导读:爬取豆瓣照片
|
下面是脚本之家 jb51.cc 通过网络收集整理的代码片段。脚本之家小编现在分享给大家,也给大家做个参考。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2014-12-22 14:46:40
# @Author : Your Name ([emailprotected])
# @Link : http://example.org
# @Version : $Id$
import os
import urllib.request
import re
import time
# Browser-like request headers sent by getHtml1 with every request.
# The User-Agent value is split across two literals purely for line length.
header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Cache-Control": "max-age=0",
    "Accept-Language": "zh-cn,zh;q=0.8;",
    "Connection": "keep-alive",
    "Host": "www.douban.com",
    "Referer": "http://www.douban.com",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko)"
        " Chrome/39.0.2171.95 Safari/537.36"
    ),
}
def getHtml1(url):
    """Fetch *url* using the module-level ``header`` dict and return the body as text.

    ``header`` advertises ``Accept-Encoding: gzip,deflate,sdch`` but urllib
    does not decompress responses on its own, so the body is decompressed
    here according to the ``Content-Encoding`` response header before it is
    decoded as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the (decompressed) body is not valid UTF-8.
    """
    # Imported locally so this function does not depend on file-level imports.
    import gzip
    import zlib

    req = urllib.request.Request(url, headers=header)
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(req) as resp:
        raw = resp.read()
        content_encoding = (resp.headers.get("Content-Encoding") or "").strip().lower()

    if content_encoding == "gzip":
        raw = gzip.decompress(raw)
    elif content_encoding == "deflate":
        try:
            raw = zlib.decompress(raw)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib wrapper.
            raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    # Any other encoding (e.g. sdch) cannot be handled with the standard
    # library; the bytes are decoded as-is, which is what the code did before.
    return raw.decode('utf-8')
def getHtml(url):
    """Fetch *url* with a small set of browser-like headers and return the body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # urllib.request.URLopener is deprecated and absent from recent Python
    # releases, so the same three headers are sent through Request/urlopen.
    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko)"
                " Chrome/39.0.2171.95 Safari/537.36"
            ),
            # NOTE(review): the original literal was truncated to "zh-cn,
            # in this copy of the script; this value is a reconstruction.
            "Accept-Language": "zh-cn,zh;q=0.8",
            "Accept": "text/html,*/*;q=0.8",
        },
    )
    # The context manager closes the response even if read()/decode() raises.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def getPicURL(html):
    """Return every photo-page link for online event 11865076 found in *html*.

    Args:
        html: Text of an album page.

    Returns:
        A list of URLs of the form
        ``http://www.douban.com/online/11865076/photo/<digits>/?sortby=time``
        in document order (duplicates are kept); empty if none are present.
    """
    # Dots and the "?" are escaped and "\d+" matches the numeric photo id;
    # without the backslashes the pattern cannot match a real link.
    # "https?" also accepts the same links served over HTTPS.
    photo_page = r"https?://www\.douban\.com/online/11865076/photo/\d+/\?sortby=time"
    return re.findall(photo_page, html)
def openPic(picURL):
    """Open a photo page and return the URL of the full-size image on it.

    Args:
        picURL: URL of a single photo page.

    Returns:
        The ``src`` of the first ``<img>`` pointing at
        ``img<digit>.douban.com/view/photo/photo/public/p<10 digits>.jpg``,
        or ``None`` if the page could not be fetched or has no such image.
        A warning is printed in both failure cases.
    """
    warning = "!!!!!!!!!!!!!!!!!!!!!WARING:AN ERROR HAPPENED while openPic!!!!!!!!!!!!!!!!!!!!!"
    # The capture group isolates the URL, replacing the split('"') step.
    image_tag = r'<img src="(https?://img\d\.douban\.com/view/photo/photo/public/p\d{10}\.jpg)"'

    try:
        html = getHtml(picURL)
    except Exception as exc:
        # Best effort: one unreachable photo page must not abort the run.
        print(warning)
        print("openPic failed for %s: %r" % (picURL, exc))
        return None

    found = re.search(image_tag, html)
    if found is None:
        print(warning)
        print("openPic found no image on %s" % picURL)
        return None
    return found.group(1)
def picDownload(picURLs, page_num, save_dir=r"C:\Users\Lyle\Desktop\douPIC"):
    """Download the full-size image behind every photo page in *picURLs*.

    Each image is saved under its own file name inside *save_dir*. If a file
    with that name already exists, the copy is saved with a "副本" prefix
    instead. A failure on one photo is reported and the loop moves on to
    the next one.

    Args:
        picURLs: Photo-page URLs, as returned by ``getPicURL``.
        page_num: Zero-based album page index; only used in progress output.
        save_dir: Target directory; created if it does not exist.

    Returns:
        The result of the last successful ``urllib.request.urlretrieve``
        call, or ``''`` if nothing was downloaded.
    """
    warning = "!!!!!!!!!!!!!!!!!!!!!WARING:AN ERROR happened wile picDownload!!!!!!!!!!!!!!!!!!!!!"
    download_img = ''

    try:
        os.makedirs(save_dir, exist_ok=True)
        # A set gives O(1) "already downloaded?" checks.
        existing = set(os.listdir(save_dir))
    except OSError as exc:
        print(warning)
        print("picDownload cannot use directory %s: %r" % (save_dir, exc))
        return download_img

    # enumerate() yields the true position even when a URL appears twice.
    for position, picURL in enumerate(picURLs, start=1):
        try:
            picURL_new = openPic(picURL)
            if picURL_new:
                base_name = picURL_new.rsplit("/", 1)[-1]
                is_new = base_name not in existing
                file_name = base_name if is_new else "副本" + base_name
                download_img = urllib.request.urlretrieve(
                    picURL_new, os.path.join(save_dir, file_name)
                )
                if is_new:
                    existing.add(base_name)
                print("第%d页 第%d张 ......%s......... downloaded" % (page_num + 1, position, base_name))
        except Exception as exc:
            # Best effort: report the failure and keep going with the rest.
            print(warning)
            print("picDownload failed for %s: %r" % (picURL, exc))
        # Pause between photos so the server is not hammered.
        time.sleep(1)
    return download_img
if __name__ == '__main__':
    # Walk the album page by page; the "start" query parameter advances by
    # 90 for every page.
    num = 0
    page_num = 0
    while True:
        html = getHtml(r'http://www.douban.com/online/11865076/album/137771083/?start=%d&sortby=time' % (num + page_num * 90))
        picURLs = getPicURL(html)
        if not picURLs:
            # No photo links on this page: stop instead of requesting
            # further pages forever.
            break
        print("**************第%d页下载开始***************" % (page_num + 1))
        picDownload(picURLs, page_num)
        print("**************第%d页下载完成***************" % (page_num + 1))
        page_num += 1
以上是脚本之家(jb51.cc)为你收集整理的全部代码内容,希望文章能够帮你解决所遇到的程序开发问题。 如果觉得脚本之家网站内容还不错,欢迎将脚本之家网站推荐给程序员好友。 (编辑:安卓应用网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!
推荐文章
站长推荐
热点阅读
