python抓取图片示例
发布时间:2020-05-24 23:32:24 所属栏目:Python 来源:互联网
导读:python抓取图片示例
|
下面是脚本之家 jb51.cc 通过网络收集整理的代码片段。 脚本之家小编现在分享给大家,也给大家做个参考。
#!/usr/bin/python
# -*- coding:utf-8 -*-
import re
import os
import urllib,urllib2,cookielib
import shutil
from BeautifulSoup import BeautifulSoup
# ---- utils ----
def normalize_url(url):
    """Return *url* with ``http://`` prepended unless it already has a scheme.

    URLs that already start with ``http://`` or ``https://`` are returned
    unchanged. The scheme comparison ignores case because URL schemes are
    case-insensitive. Any other input, including the empty string, gets
    ``http://`` put in front of it.
    """
    # A tuple argument checks both schemes in one call; the previous
    # cmp()-based test only knew "http://" and so turned "https://x" into
    # "http://https://x".
    if url.lower().startswith(("http://", "https://")):
        return url
    return "http://" + url
def safeDir(dir):
    """Return *dir* with path separators removed.

    The result can be used as a single directory name: ``/`` is always
    stripped, and so are the running platform's own separators
    (``os.sep`` / ``os.altsep``), which covers ``\\`` on Windows.

    The parameter keeps its original name ``dir`` (shadowing the builtin)
    so that existing keyword callers continue to work.
    """
    separators = set(['/', os.sep])
    if os.altsep:
        separators.add(os.altsep)
    for separator in separators:
        dir = dir.replace(separator, '')
    return dir
# ---- variable ----
# A forum index page URL is built as: homepagePrefix + page number + homepageSuffix.
homepagePrefix = "http://60dxw.comww1.baisex.me/forum-47-"
homepageSuffix = ".html"
# Prepended to every thread href extracted from an index page.
threadPrefix = "http://60dxw.comww1.baisex.me/"
# Directory (relative to the starting working directory) that receives all downloads.
homedir = "baixingge"
# ---- login ----
# Cookie-aware opener.
# NOTE(review): the crawl section fetches pages through urllib.urlopen /
# urllib.urlretrieve and never calls this opener -- confirm whether cookies
# are actually required before wiring it in or removing it.
cookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie)
# ---- file ----
# Create the download root on first run, then make it the working directory
# so that every later relative path lands inside it.
if not os.path.exists(homedir):
    os.mkdir(homedir)
os.chdir(homedir)
# ---- crawl ----
# Walk forum index pages 1..24. For every thread cell found on a page, create
# <page>/<sanitized thread title>/ under the current working directory and
# save the thread's images there as 0.<ext>, 1.<ext>, ...
#
# Files are addressed by path instead of os.chdir()/"../" bookkeeping, so an
# exception in the middle of a thread cannot leave the process in the wrong
# directory.

# Compiled once up front; none of these depend on the page or thread.
urlPattern = re.compile(r'href="([^"]*)"')
titlePattern = re.compile(r'>([^<]*)</a>')
imgPattern = re.compile(r'img src="([^"]*)" onload')

for page in range(1, 25):
    pageUrl = '{0}{1}{2}'.format(homepagePrefix, page, homepageSuffix)
    # ---- mkdir ----
    pageDir = str(page)
    if not os.path.exists(pageDir):
        os.mkdir(pageDir)
    print(pageUrl)
    # ---- download ----
    html_body = urllib.urlopen(pageUrl).read()
    soup = BeautifulSoup(html_body)
    # ---- extract ----
    urlRaws = soup.findAll('th', attrs={'class': ['new', 'common']})
    for urlRaw in urlRaws:
        rawHtml = str(urlRaw)
        h = urlPattern.search(rawHtml)
        t = titlePattern.search(rawHtml)
        if h is None or t is None:
            # Cell without a link or a title: nothing to download.
            continue
        threadUrl = h.group(1)
        threadTitle = t.group(1)
        # Sanitize once and use the same name for the existence check and
        # for mkdir; checking the raw title let mkdir fail on titles that
        # contain '/'.
        threadDirName = safeDir(threadTitle)
        if not threadDirName:
            # An empty name cannot be used as a directory.
            continue
        threadDir = os.path.join(pageDir, threadDirName)
        if os.path.exists(threadDir):
            # Directory already present (e.g. from an earlier run): skip.
            continue
        os.mkdir(threadDir)
        page_url = threadPrefix + threadUrl
        print("---->{0}".format(page_url))
        print("---->{0}".format(threadDirName))
        page_body = urllib.urlopen(page_url).read()
        page_soup = BeautifulSoup(page_body)
        for index, img in enumerate(imgPattern.findall(str(page_soup))):
            print("-------->{0}".format(img))
            # Take the extension from the URL path only: drop any query or
            # fragment first, and let splitext ignore dots that belong to
            # the host name or to a parent path segment. URLs without an
            # extension simply get none instead of raising ValueError.
            imgPath = img.split('?', 1)[0].split('#', 1)[0]
            imgSuffix = os.path.splitext(imgPath)[1]
            imgName = os.path.join(threadDir, "{0}{1}".format(index, imgSuffix))
            try:
                urllib.urlretrieve(img, imgName)
            except IOError as err:
                # One unreachable image should not abort the whole crawl.
                print("-------->failed: {0}".format(err))
以上是脚本之家(jb51.cc)为你收集整理的全部代码内容,希望文章能够帮你解决所遇到的程序开发问题。 如果觉得脚本之家网站内容还不错,欢迎将脚本之家网站推荐给程序员好友。 (编辑:安卓应用网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |
相关内容
- 使用Python使用Tumblr API时无法获得OAuth“请求令牌”
- Python 正则表达式爬虫使用案例解析
- python模块简介之有序字典(OrderedDict)
- 【Python】python文件打开方式详解——a、a+、r+、w+区别
- 实例讲解Python编程中@property装饰器的用法
- python中urllib.unquote乱码的原因与解决方法
- python – 熊猫:按索引值分组,然后计算分位数?
- python 协程 gevent原理与用法分析
- python的Tqdm模块的使用
- restructuredText,docstring和python交互式shell
推荐文章
站长推荐
- python基础学习16----模块
- python – 通过没有循环的2D索引数组索引2D nump
- Python中生成Epoch的方法
- 模拟登录封包python实现
- 使用dictwriter覆盖相同csv文件中的行
- 【Python】收集的高级函数、功能
- Python中str.format()详解
- python win32 简单操作方法
- python – 从文件中获取数据,而不是多次迭代它
- python实现的用于搜索文件并进行内容替换的类实例
热点阅读
