¼ÓÈëÊÕ²Ø | ÉèΪÊ×Ò³ | »áÔ±ÖÐÐÄ | ÎÒҪͶ¸å °²×¿Ó¦ÓÃÍø £¨https://www.0791zz.com/£©- ¿Æ¼¼¡¢½¨Õ¾¡¢¾­Ñé¡¢ÔÆ¼ÆËã¡¢5G¡¢´óÊý¾Ý,Õ¾³¤Íø!
µ±Ç°Î»Ö㺠Ê×Ò³ > ±à³Ì¿ª·¢ > Python > ÕýÎÄ

python抓取图片示例

·¢²¼Ê±¼ä£º2020-05-24 23:32:24 ËùÊôÀ¸Ä¿£ºPython À´Ô´£º»¥ÁªÍø
导读：python抓取图片示例

ÏÂÃæÊǽű¾Ö®¼Ò jb51.cc ͨ¹ýÍøÂçÊÕ¼¯ÕûÀíµÄ´úÂëÆ¬¶Î¡£

脚本之家小编现在分享给大家，也给大家做个参考。

#!/usr/bin/python
# -*- coding:utf-8 -*-

import re
import os
import urllib,urllib2,cookielib
import shutil
from BeautifulSoup import BeautifulSoup 

# ---- utils ----
def normalize_url(url):
    """Return *url* unchanged if it already begins with "http://",
    otherwise return it with "http://" prepended.

    The original compared the first 7 characters with the Python-2-only
    ``cmp()`` builtin; ``str.startswith`` is the equivalent, idiomatic
    form and also works on Python 3.
    """
    return url if url.startswith("http://") else "http://" + url

def safeDir(dir):
    """Strip every '/' from *dir* so a thread title can serve as a single
    directory name (a slash would otherwise create nested paths)."""
    return ''.join(dir.split('/'))

# ---- variable ----
# Forum listing pages follow the pattern <prefix><page-number><suffix>.
homepagePrefix = "http://60dxw.comww1.baisex.me/forum-47-"
homepageSuffix = ".html"
# Thread hrefs scraped from a listing page are relative; this is their base URL.
threadPrefix = "http://60dxw.comww1.baisex.me/"
# Root directory under which everything is downloaded (created below if absent).
homedir = "baixingge"

# ---- login ----
# Build a urllib2 opener that would carry cookies across requests.
# NOTE(review): `opener` is never used in the visible code -- all requests go
# through urllib.urlopen/urllib.urlretrieve, so the cookie jar has no effect.
cookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie)

# ---- file ----
# Ensure the download root exists, then make it the working directory;
# every path used by the crawl loop below is relative to it.
if (os.path.exists(homedir) == False):
    os.mkdir(homedir)
os.chdir(homedir)

# ---- crawl ----
# Walk forum listing pages 1..24; for each thread linked from a page,
# download every image of the thread into <page>/<sanitized-title>/.
for page in range(1,25):
    pageUrl = '{0}{1}{2}'.format(homepagePrefix,page,homepageSuffix)
    # ---- mkdir ----
    # One sub-directory per listing page, named after the page number,
    # and descend into it for the duration of this page.
    if (os.path.exists(str(page)) == False):
        os.mkdir(str(page))
    os.chdir(str(page))
    print pageUrl

    # ---- download ----
    # Fetch the listing page and parse it (BeautifulSoup 3 API: findAll).
    html_body = urllib.urlopen(pageUrl).read()
    soup = BeautifulSoup(html_body)

    # ---- extract ----
    # Thread links live in <th class="new"> / <th class="common"> cells; the
    # href and the anchor text (thread title) are pulled out with regexes
    # applied to the cell's string form.
    threaddUrls = []  # NOTE(review): never populated or read -- appears dead.
    urlRaws = soup.findAll('th',attrs = {'class' : ['new','common']})
    urlPattern = re.compile(r'href="([^"]*)"')
    titlePattern = re.compile(r'>([^<]*)</a>')
    for urlRaw in urlRaws: 
        h = urlPattern.search(str(urlRaw))
        t = titlePattern.search(str(urlRaw))
        threadUrl = h.group(1)
        threadTitle = t.group(1)
        # Skip threads whose directory already exists; otherwise create it
        # and descend into it.
        # NOTE(review): the existence check uses the raw title, but mkdir and
        # chdir use safeDir(threadTitle); for a title containing '/', the
        # check inspects a different name than the one created -- confirm.
        if (os.path.exists(threadTitle) == False):
            os.mkdir(safeDir(threadTitle))
        else:
            continue
        os.chdir(safeDir(threadTitle))

        page_url = threadPrefix + threadUrl
        print "---->{0}".format(page_url)
        print "---->{0}".format(safeDir(threadTitle))
        # Fetch and parse the thread page itself.
        page_body = urllib.urlopen(page_url).read()
        page_soup = BeautifulSoup(page_body)

        # Image tags on this forum carry an onload attribute; grab each src
        # and save it locally as 0.<ext>, 1.<ext>, ... preserving the
        # original file extension (everything from the last '.').
        imgPattern = re.compile(r'img src="([^"]*)" onload')
        i = imgPattern.findall(str(page_soup))
        index = 0
        for img in i:
            print "-------->{0}".format(img)
            imgSuffix = img[img.rindex('.'):]
            imgName = "{0}{1}".format(str(index),imgSuffix)
            urllib.urlretrieve(img,imgName,None)
            index += 1

        # Back up to the page directory, then (after the inner loop) to root.
        os.chdir("../")
    os.chdir("../")

ÒÔÉÏÊǽű¾Ö®¼Ò(jb51.cc)ΪÄãÊÕ¼¯ÕûÀíµÄÈ«²¿´úÂëÄÚÈÝ£¬Ï£ÍûÎÄÕÂÄܹ»°ïÄã½â¾öËùÓöµ½µÄ³ÌÐò¿ª·¢ÎÊÌâ¡£

Èç¹û¾õµÃ½Å±¾Ö®¼ÒÍøÕ¾ÄÚÈÝ»¹²»´í£¬»¶Ó­½«½Å±¾Ö®¼ÒÍøÕ¾ÍÆ¼ö¸ø³ÌÐòÔ±ºÃÓÑ¡£

£¨±à¼­£º°²×¿Ó¦ÓÃÍø£©

¡¾ÉùÃ÷¡¿±¾Õ¾ÄÚÈݾùÀ´×ÔÍøÂ磬ÆäÏà¹ØÑÔÂÛ½ö´ú±í×÷Õ߸öÈ˹۵㣬²»´ú±í±¾Õ¾Á¢³¡¡£ÈôÎÞÒâÇÖ·¸µ½ÄúµÄȨÀû£¬Ç뼰ʱÓëÁªÏµÕ¾³¤É¾³ýÏà¹ØÄÚÈÝ!

    ÍÆ¼öÎÄÕÂ
      ÈȵãÔĶÁ