Skip to content

Commit e2e3650

Browse files
committed
提交代码
1 parent 9f10be5 commit e2e3650

File tree

3 files changed

+107
-0
lines changed

3 files changed

+107
-0
lines changed

.DS_Store

0 Bytes
Binary file not shown.

xianhuan/.DS_Store

2 KB
Binary file not shown.

xianhuan/doutu/doutu.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
"""
4+
@author: 闲欢
5+
"""
6+
7+
import threading
8+
import requests
9+
from lxml import etree
10+
import os
11+
import random
12+
import time
13+
from queue import Queue
14+
15+
# HTTP headers sent with every request made by this script.
headers = {
    # Browser-like user agent string.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/62.0.3202.94 Safari/537.36'
    ),
    # Placeholder ("your cookie"): replace with a real cookie value before running.
    'cookie': '你的cookie',
}
19+
20+
21+
class Producer(threading.Thread):
    """Worker thread that turns listing-page URLs into image download jobs.

    Each producer repeatedly takes a page URL from ``page_queue``, fetches the
    page, and puts one ``(img_url, img_name)`` tuple on ``img_queue`` for every
    non-GIF image found. The thread ends once ``page_queue`` is drained.
    """

    # Inclusive bounds, in seconds, of the random pause before each page fetch.
    DELAY_RANGE = (1, 3)
    # Seconds to wait for a listing page before giving up on it.
    REQUEST_TIMEOUT = 30
    # Path separators and other characters that are unsafe in a file name.
    _UNSAFE_CHARS = str.maketrans(dict.fromkeys('\\/:*?"<>|\r\n\t', '_'))

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``threading.Thread``.

        Args:
            page_queue: Queue of listing-page URLs to consume.
            img_queue: Queue that receives ``(img_url, img_name)`` tuples.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Process page URLs until ``page_queue`` is empty."""
        # Local import: the file itself only imports ``Queue`` from ``queue``.
        from queue import Empty

        while True:
            # Take the URL without blocking. Checking empty() and then calling
            # a blocking get() can hang forever when another producer grabs
            # the last URL between the two calls.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break

            # Pause a few seconds so the site is not hit in a tight loop.
            time.sleep(random.randint(*self.DELAY_RANGE))
            try:
                self.parse_page(url)
            except requests.RequestException as exc:
                # One unreachable page must not kill the worker; skip it.
                print('failed to fetch {}: {}'.format(url, exc))

    def parse_page(self, url):
        """Fetch one listing page and queue every non-GIF image found on it.

        Args:
            url: Address of the listing page to download.

        Raises:
            requests.RequestException: If the page cannot be fetched.
        """
        response = requests.get(
            url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        html = etree.HTML(response.text)
        for img in html.xpath("//div[@class='random_picture']//a//img"):
            # Skip animated images flagged through their CSS class.
            if img.get('class') == 'gif':
                continue

            # Image URL; a tag without one cannot be downloaded.
            img_url = img.get('data-backup')
            if not img_url:
                continue

            # File extension of the image; GIFs are filtered out.
            suffix = os.path.splitext(img_url)[1]
            if suffix.lower() == '.gif':
                continue

            img_name = self._build_name(img.get('alt'), img_url, suffix)
            self.img_queue.put((img_url, img_name))

    def _build_name(self, alt, img_url, suffix):
        """Return a file name for the image that is safe to create on disk."""
        # Fall back to the URL's base name when the tag has no alt text.
        stem = alt or os.path.splitext(os.path.basename(img_url))[0] or 'image'
        # The name comes from the remote page, so neutralise path separators
        # and other characters that could escape the target folder.
        return (stem + suffix).translate(self._UNSAFE_CHARS)
59+
60+
61+
class Consumer(threading.Thread):
    """Worker thread that downloads the images queued on ``img_queue``.

    The thread keeps taking ``(url, filename)`` tuples from ``img_queue`` and
    saves each image below ``IMAGE_DIR``. It stops only when no image is
    queued and none can arrive any more: ``page_queue`` is empty and no other
    live thread that shares ``page_queue`` (i.e. a producer) is still running.
    """

    # Folder the downloaded images are written to.
    IMAGE_DIR = "./images/"
    # Seconds to wait for one image download.
    REQUEST_TIMEOUT = 30
    # Seconds to wait for a new image before re-checking whether work is left.
    POLL_SECONDS = 1

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``threading.Thread``.

        Args:
            page_queue: Queue of listing-page URLs (only inspected here).
            img_queue: Queue of ``(url, filename)`` tuples to download.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download queued images until all work is finished."""
        # Local import: the file itself only imports ``Queue`` from ``queue``.
        from queue import Empty

        while not self._all_work_done():
            # Bounded wait instead of a blocking get(): if another consumer
            # takes the last image, this thread re-checks and exits rather
            # than hanging forever.
            try:
                url, filename = self.img_queue.get(timeout=self.POLL_SECONDS)
            except Empty:
                continue

            try:
                self._download(url, filename)
            except (requests.RequestException, OSError) as exc:
                # A single failed image must not kill the worker.
                print(filename + ' download failed: ' + str(exc))

    def _all_work_done(self):
        """Return True when no image is queued and none can still arrive."""
        # Order matters: first establish that nothing can be produced any
        # more, then look at img_queue, so a last-moment put() is not missed.
        if not self.page_queue.empty() or self._producers_alive():
            return False
        return self.img_queue.empty()

    def _producers_alive(self):
        """Return True while a non-consumer thread sharing page_queue runs."""
        return any(
            getattr(thread, 'page_queue', None) is self.page_queue
            and not isinstance(thread, Consumer)
            for thread in threading.enumerate()
        )

    def _download(self, url, filename):
        """Fetch ``url`` and store it as ``filename`` inside ``IMAGE_DIR``."""
        # Download before opening the file so that a failed request does not
        # leave an empty file behind.
        response = requests.get(
            url, timeout=self.REQUEST_TIMEOUT, headers=headers)
        # Reject HTTP error responses instead of saving an error page as image.
        response.raise_for_status()
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        with open(os.path.join(self.IMAGE_DIR, filename), 'wb') as f:
            f.write(response.content)
        print(filename + ' 下载完成!')
78+
79+
80+
def main():
    """Crawl the first five listing pages and download their images.

    Queues the listing URLs, starts six producer and six consumer threads,
    and returns once all of them have finished.
    """
    # The consumers write into ./images/, so it must exist before they start;
    # otherwise every download fails on open().
    os.makedirs("./images", exist_ok=True)

    # Queue of listing-page URLs and queue of images waiting to be downloaded.
    page_queue = Queue(15)
    img_queue = Queue(20)
    page_queue.put('https://www.doutula.com/photo/list/')
    for page in range(2, 6):
        page_queue.put(
            "https://www.doutula.com/photo/list/?page={}".format(page))

    # Producers are started before consumers, as in the original order.
    workers = [Producer(page_queue, img_queue) for _ in range(6)]
    workers += [Consumer(page_queue, img_queue) for _ in range(6)]
    for worker in workers:
        worker.start()

    # Wait for the workers so that main() returns only when the crawl is over.
    for worker in workers:
        worker.join()
96+
97+
98+
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
100+
101+
102+
103+
104+
105+
106+
107+

0 commit comments

Comments
 (0)