No matter the pain or the hardship, you should still feel the strength of being alive. ---- Noragami

Site address: www.xiaohuar.com/hua

Workflow (a rough sketch of the driver follows the list):

  1. Get the total number of pages
  2. Get the album URLs of all the girls on the current page
  3. Get all the image URLs inside each album
  4. Download the images
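The four steps map one-to-one onto the methods of the Xiaohua class in the full script below; roughly, the driver loop looks like this (method names are from that script):

x = Xiaohua()                    # sketch only; see the full class definition below
total = x.get_page()             # step 1: total page count
for num in range(total):
    x.get_image_url(num)         # step 2: collect album URLs on this page
    x.get_xiaohua_total_img()    # steps 3 and 4: scrape each album and download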

Libraries used (install note after the list):

  • requests
  • re
  • os
  • bs4.BeautifulSoup
  • urllib.request.urlretrieve
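Of these, requests, bs4, and the lxml parser that BeautifulSoup is asked to use below are third-party; re, os, and urllib.request ship with Python. If they are missing, something like this installs them:

pip install requests beautifulsoup4 lxml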

This site is fairly simple, so I'll just post the full source code~

import requests
import re
import os
from bs4 import BeautifulSoup
from urllib.request import urlretrieve


class Xiaohua():

    def __init__(self):
        # Top-level download directory
        try:
            os.mkdir('xiaohua')
        except FileExistsError:
            pass
        self.url_list = []
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}

    # Get the total number of pages
    def get_page(self):
        url = "http://www.xiaohuar.com/hua/"
        get_request_page = requests.get(url, headers=self.headers).text
        Bsoup = BeautifulSoup(get_request_page, 'lxml')
        Bfind = Bsoup.find('div', class_="page_num").find_all('a')
        # 尾页 is the site's "last page" link; its URL carries the highest page index
        re_str = re.compile(r'<a href="http://www.xiaohuar.com/list-1-(\d+).html">尾页</a>')
        page = int(re_str.search(str(Bfind[-1])).groups()[0]) + 1
        return page

    # Get the album URLs of all the girls on the current page
    def get_image_url(self, page):
        print("Fetching album URLs on page {}......".format(page + 1))
        url = "http://www.xiaohuar.com/list-1-{}.html".format(page)
        request_url = requests.get(url, headers=self.headers).text
        img_url_soup = BeautifulSoup(request_url, 'lxml')
        img_find = img_url_soup.find('div', class_="demo clearfix").find_all('div', class_="img")
        re_name = re.compile(r'<a href="(.*?)" target="_blank"><img alt="(.*?)" src')
        for img in img_find:
            link = re_name.search(str(img)).groups()
            # The album page uses 's-' where the profile page uses 'p-'
            self.url_list.append(link[0].replace('p-', 's-'))
            print("Got the album URL of '{}'".format(link[1]))
        print("Finished collecting album URLs on page {}!".format(page + 1))
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        # list.sort() sorts in place and returns None, so sort first, then return
        self.url_list.sort()
        return self.url_list

    # Download one image
    def download(self, image_site, name):
        print("Downloading `{}`".format(image_site.split('/')[-1]))
        url = "http://www.xiaohuar.com" + image_site
        try:
            os.mkdir('xiaohua/{}'.format(name))
        except FileExistsError:
            pass
        # Save as xiaohua/<album>/<file name minus extension>.<extension>
        urlretrieve(url, 'xiaohua/{}/{}.{}'.format(name, image_site.split('/')[-1][0:-4], image_site.split('.')[-1]))

    # Get all the image URLs inside each album
    def get_xiaohua_total_img(self):
        # Consume the queue of album URLs until it is empty
        while self.url_list:
            url = self.url_list.pop()
            img_request = requests.get(url, headers=self.headers).text
            bs = BeautifulSoup(img_request, 'lxml')
            bfind_name = bs.find('div', class_="pic_con_box ad-gallery").find_all('h1')
            re_name = re.compile(r'<h1>(.*?)<span class')
            xiaohua_name = re_name.search(str(bfind_name[0])).groups()[0]
            bfind = bs.find('ul', class_="ad-thumb-list").find_all('a')
            re_img = re.compile(r'<a class="" href="(.*?)"')
            print("Start downloading `{}`".format(xiaohua_name))
            for img in bfind:
                imgs = re_img.search(str(img)).groups()[0]
                self.download(imgs, xiaohua_name)
            print("`{}` downloaded~".format(xiaohua_name))
            print("***************************************")

    def main(self):
        page = self.get_page()
        for num in range(page):
            self.get_image_url(num)
            self.get_xiaohua_total_img()
        print("Crawl finished!")


if __name__ == '__main__':
    x = Xiaohua()
    x.main()
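Assuming the site structure hasn't changed, the downloads end up in a layout like this (each album folder is named after the <h1> title scraped from its page):

xiaohua/
    <name 1>/
        <image>.jpg
        ...
    <name 2>/
        ...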

Result:

[screenshot: xiaohua1]