# coding=utf-8
# This is the source code.
import importlib
import os
import re
import sys
import time

import requests
from lxml import etree
importlib.reload(sys)
# Define a crawler
class spider(object):
    """Small image crawler.

    Workflow (see ``ppic``): download a page, extract image URLs with an
    XPath query, then save every image into the local ``pic`` directory.
    """

    # Seconds to wait for a server before giving up; without a timeout
    # ``requests.get`` may block forever on an unresponsive host.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # NOTE(review): the original message text was lost when the code was
        # pasted through forum markup; this wording is a placeholder.
        print('start crawling...')

    # getsource fetches the source of a web page
    def getsource(self, url):
        """Return the body of ``url`` as text.

        Raises whatever ``requests.get`` raises on network failure/timeout.
        """
        html = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return html.text

    # changepage produces the links for the different page numbers
    def changepage(self, url, total_page):
        """Return the page links from the page found in ``url`` up to ``total_page``.

        The current page is read from the ``index_<n>`` marker in ``url``.
        One link per page number in ``[n, total_page]`` is returned, so the
        list is empty when ``n > total_page``.  When ``url`` has no such
        marker there is nothing to substitute, so ``[url]`` is returned
        instead of several identical copies of it.
        """
        marker = re.search(r'index_(\d+)', url)  # adjustable
        if marker is None:
            return [url]
        now_page = int(marker.group(1))
        # The 4th positional argument of re.sub is ``count``, not ``flags``,
        # so no flag is passed here (the pattern does not need one).
        return [
            re.sub(r'index_\d+', 'index_%s' % i, url)  # adjustable
            for i in range(now_page, total_page + 1)
        ]

    # getpic collects the picture URLs of one web page
    def getpic(self, source):
        """Return the list of ``src`` values matched by the XPath in ``source``.

        The list is empty when the page has no matching ``<img>`` elements.
        """
        selector = etree.HTML(source)
        pic_url = selector.xpath('//ul[@class="ali"]/li/div/a/img/@src')  # adjustable
        return pic_url

    # savepic saves the results into the pic folder
    def savepic(self, pic_url, link=''):
        """Download every URL in ``pic_url`` into the ``pic`` directory.

        Files are named ``<digits of link>-<index>.jpg``.  ``link`` is the
        page the pictures came from; it is optional so existing one-argument
        calls keep working (the name prefix is then empty).

        Nothing is downloaded when ``pic_url`` is empty, which is what
        happens when the XPath in ``getpic`` matches nothing on the page.
        """
        if not pic_url:
            return
        picnamestr = ''.join(re.findall(r'\d+', link))  # adjustable
        # Create the target directory up front; open() would fail without it.
        os.makedirs('pic', exist_ok=True)
        for i, each in enumerate(pic_url):
            print('now downloading:{}'.format(each))
            pic = requests.get(each, timeout=self.REQUEST_TIMEOUT)
            target = os.path.join('pic', picnamestr + '-' + str(i) + '.jpg')
            # ``with`` closes the file even if the write fails.
            with open(target, 'wb') as fp:
                fp.write(pic.content)

    # ppic ties the methods of the class together
    def ppic(self, link):
        """Fetch ``link``, extract its picture URLs and save them."""
        # NOTE(review): the original message text was lost when the code was
        # pasted through forum markup; this wording is a placeholder.
        print('now processing page:{}'.format(link))
        html = self.getsource(link)
        pic_url = self.getpic(html)
        self.savepic(pic_url, link)
# Script entry point: crawl the configured pages and report the elapsed time.
if __name__ == '__main__':
    start = time.time()
    url = 'http://www.baidu.com/'  # adjustable: start page to crawl
    picspider = spider()
    all_links = picspider.changepage(url, 3)  # adjustable: last page number
    for link in all_links:
        picspider.ppic(link)
    end = time.time()
    # Elapsed time is end minus start; the reverse order printed a negative value.
    print('耗时:{}'.format(end - start))