Python爬虫

照着简书上的教程写了个爬煎蛋无聊图的爬虫

在原来的基础上加了异常处理，这样就不会爬到一半中断了。

贴代码:

#!/usr/bin/python
#-*- coding: utf-8 -*-
#encoding=utf-8

import socket  
import urllib2  
import urllib  
import os  
from BeautifulSoup import BeautifulSoup

def getAllImageLink(subUrl,page):  
    pageUrl = 'http://jandan.net/%s/page-%d' % (subUrl,page)
    print pageUrl
    html = urllib2.urlopen(pageUrl).read()
    soup = BeautifulSoup(html)
    commentlist = soup.findAll('ol',attrs={"class":"commentlist"})

    for liResult in commentlist:
        liResult = soup.findAll('li')
        number = 0

        for li in liResult:
            imageEntityArray = li.findAll('img')

            for image in imageEntityArray:
                srcLink = image.get('src')
                orgsrcLink = image.get('org_src')
                if orgsrcLink:
                    link = orgsrcLink
                else:
                    link = srcLink
                try:
                    imageNameList = link.split('/')
                    number+=1
                    subfix = imageNameList[-1]
                    imageName = 'page%d_%d_%s' % (page ,number , subfix[-8:])
                    filesavepath = '/Volumes/account/jiandan/%s/%s' % (subUrl,imageName)
                    urllib.urlretrieve(link,filesavepath)
                except:
                    continue

        print filesavepath 

if __name__ == '__main__':
    # Global socket timeout so a stalled download cannot hang the crawl.
    socket.setdefaulttimeout(20)
    first_page, last_page = 4017, 4500
    for page_number in range(first_page, last_page):
        getAllImageLink('pic', page_number)

引用:

iOS程序员如何使用python写网路爬虫