
Side-project series: 爬虫小子 (updated occasionally)

爬虫小子: a blessing for anime fans

When I get tired of studying low-level computer fundamentals, I occasionally tinker with something less serious as a reward (just kidding)

Recently I've been working through the book Web Scraping with Python (《Python网络数据采集》), picking up a lot of networking-related knowledge and getting a taste of object-oriented program design, though the main goal is of course to learn web scraping

My feeling is that scraping is easy to pick up but tough to master (especially now that anti-scraping techniques keep getting more sophisticated). Here I'll just document the scrappy crawlers I've written…
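
As the most basic countermeasure against such blocking, here is a minimal sketch of my own (not part of any of the scripts below) that fetches a page while sending a browser-like User-Agent with requests; the URL is just a placeholder:

import requests as req

# Many sites reject the default python-requests User-Agent,
# so present a browser-like one instead. Placeholder URL.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
resp = req.get("https://example.com", headers=headers, timeout=10)
print(resp.status_code, len(resp.content))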

爬虫小子_V1.0

from urllib.request import urlopen
from bs4 import BeautifulSoup

import requests as req
import os

def GetURL():
    # Ask for the article URL and collect all wallpaper <img> tags on it
    target = input("please input your target:")
    html = urlopen(target)
    bsObj = BeautifulSoup(html, "html.parser")
    images = bsObj.findAll("img", {"class": "rich_pages wxw-img"})
    return images

def CreateImages(images, name):
    i = 0
    for image in images:
        i = i + 1
        # The real image URL lives in the data-src attribute
        response = image.attrs["data-src"]
        print("finish!!!, now you get " + str(i) + " images")
        num = "test_" + str(i)
        # The URL ends in "wx_fmt=<type>", so the field after the last '=' is the extension
        dtype = response.split('=')[-1]
        num += '.' + dtype
        response = req.get(response)
        os.makedirs(name, exist_ok=True)
        with open(name + '/' + num, 'wb') as f:
            f.write(response.content)

images = GetURL()
CreateImages(images, name=input("please input its name:"))
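
For context: the data-src URLs in these WeChat articles end in a wx_fmt query parameter, which is why splitting on '=' and taking the last field yields the file extension. An illustrative (made-up) example:

# Illustrative URL only, not one taken from a real article
url = "https://mmbiz.qpic.cn/mmbiz_jpg/xxxx/640?wx_fmt=jpeg"
print(url.split('=')[-1])   # -> "jpeg"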

Let's see the results:

No sample images here; eager readers can go crawl them for themselves…

Changelog:

  • version: v1.0
  • date: 2022.3.31
  • type:
    • Features: NULL
    • Changed: NULL
    • Removed: NULL
  • desc:
    • First version; the functionality is very weak

爬虫小子_V1.1

from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as req
import random
import time
import os

nowTime = time.time()
random.seed(nowTime)

def GetURL():
    # Ask for the first article URL and parse it
    target = input("please input your target:")
    html = urlopen(target)
    bsObj = BeautifulSoup(html, "html.parser")
    return bsObj

def GetRandomURL(link_list):
    # Pick a random article from the album and ask whether to download it
    target = link_list[random.randint(0, len(link_list) - 1)]
    html = urlopen(target)
    bsObj = BeautifulSoup(html, "html.parser")
    name = GetName(bsObj)
    print("The next one is " + name)
    print("Do you want to Download it?")
    choice = input("Please input yes or no\n")
    if choice == "yes":
        print("You want to Continue OK~~~")
        images = Getimage(bsObj)
        Download(images, name)
    elif choice == "no":
        print("Try again?")
        choice = input("Please input yes or no\n")
        if choice == "yes":
            GetRandomURL(link_list)
        else:
            Exit()
    else:
        print("Wrong choice~~~~")
        Exit()

def More():
    print("Do you want more?")
    choice = input("Please input yes or no\n")
    if choice == "yes":
        print("hahaha I know you~~~~")
        Return()
    elif choice == "no":
        Exit()
    else:
        print("Wrong choice~~~~")
        Exit()
    More()  # loop by recursion until the user chooses to exit

def Return():
    # Fetch the album page and collect links to the individual articles
    target = "https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzUzNjA0MjkxMw==&action=getalbum&album_id=2119868775733706753&scene=173&from_msgid=2247561088&from_itemidx=1&count=3&nolastread=1#wechat_redirect"
    html = urlopen(target)
    bsObj = BeautifulSoup(html, "html.parser")
    link_list = GetLinks(bsObj)
    GetRandomURL(link_list)

def Getimage(bsObj):
    images = bsObj.findAll("img", {"class": "rich_pages wxw-img"})
    return images

def GetName(bsObj):
    # Titles look like "【二次元壁纸分享】123…", so keep the three digits after the tag
    title = bsObj.find("h1", {"class": "rich_media_title"})
    name = "image-" + title.string.split('【二次元壁纸分享】')[-1][:3]
    return name

def GetLinks(bsObj):
    links = bsObj.findAll("li", {"class": "album__list-item js_album_item js_wx_tap_highlight wx_tap_cell"})
    link_list = []
    for link in links:
        if link.attrs['data-link'] is not None:
            if link.attrs['data-link'] not in link_list:
                link_list.append(link.attrs['data-link'])
    return link_list

def Download(images, name):
    i = 0
    for image in images:
        i = i + 1
        print("finish!!!, now you get " + str(i) + " images")
        response = image.attrs["data-src"]
        num = name + "-" + str(i)
        dtype = response.split('=')[-1]
        num += '.' + dtype
        response = req.get(response)
        os.makedirs(name, exist_ok=True)
        with open(name + '/' + num, 'wb') as f:
            f.write(response.content)

def Exit():
    print("exit~~~~bye~~~~")
    exit(0)

bsObj = GetURL()
images = Getimage(bsObj)
name = GetName(bsObj)
Download(images, name)
More()
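
One quirk worth noting: More() loops by calling itself recursively, so an extremely persistent user could in theory hit Python's recursion limit. A minimal iterative sketch of the same prompt loop (my own variation, not part of the script above):

def MoreLoop():
    # Same idea as More(), but a plain while-loop instead of recursion
    while True:
        print("Do you want more?")
        choice = input("Please input yes or no\n")
        if choice == "yes":
            Return()   # pick and download another random article
        else:
            Exit()     # any other answer quits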

Changelog:

  • version: v1.1
  • date: 2022.4.2
  • type:
    • Features: after a Download finishes you can choose to keep Downloading; some of the album's internal links can be indexed for Download; the crawler is more interactive
    • Changed: NULL
    • Removed: removed the step of typing in a file name
  • desc:
    • The functionality is a bit stronger, but nothing qualitatively new

爬虫小子_V1.2

from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as req
import os

n = 0  # global counter of images downloaded across all articles

def GetNextURL(target):
    html = urlopen(target)
    bsObj = BeautifulSoup(html, "html.parser")
    name = GetName(bsObj)
    print("The next one is " + name)
    return bsObj

def GetAllURL():
    # Fetch the album page and collect links to every article in it
    target = "https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzUzNjA0MjkxMw==&action=getalbum&album_id=2119868775733706753&scene=173&from_msgid=2247561088&from_itemidx=1&count=3&nolastread=1#wechat_redirect"
    html = urlopen(target)
    bsObj = BeautifulSoup(html, "html.parser")
    link_list = GetLinks(bsObj)
    return link_list

def Getimage(bsObj):
    images = bsObj.findAll("img", {"class": "rich_pages wxw-img"})
    return images

def GetName(bsObj):
    title = bsObj.find("h1", {"class": "rich_media_title"})
    name = "image-" + title.string.split('【二次元壁纸分享】')[-1][:3]
    return name

def GetLinks(bsObj):
    links = bsObj.findAll("li", {"class": "album__list-item js_album_item js_wx_tap_highlight wx_tap_cell"})
    link_list = []
    for link in links:
        if link.attrs['data-link'] is not None:
            if link.attrs['data-link'] not in link_list:
                link_list.append(link.attrs['data-link'])
    return link_list

def DownloadApart(images, name):
    # Save each article's images into its own folder
    global n
    i = 0
    for image in images:
        i += 1
        n += 1
        print("finish!!!, now you get " + str(n) + " images")
        response = image.attrs["data-src"]
        num = name + "-" + str(i)
        dtype = response.split('=')[-1]
        num += '.' + dtype
        response = req.get(response)
        os.makedirs(name, exist_ok=True)
        with open(name + '/' + num, 'wb') as f:
            f.write(response.content)

def DownloadTogether(images, name):
    # Save all images into the current working directory
    global n
    i = 0
    for image in images:
        i += 1
        n += 1
        print("finish!!!, now you get " + str(n) + " images")
        response = image.attrs["data-src"]
        num = name + "-" + str(i)
        dtype = response.split('=')[-1]
        num += '.' + dtype
        response = req.get(response)
        with open(num, 'wb') as f:
            f.write(response.content)

print("Do you want to Together or Apart?")
choice = input("Please input 't' for Together 'a' for Apart\n")

link_list = GetAllURL()
for link in link_list:
    bsObj = GetNextURL(link)
    images = Getimage(bsObj)
    name = GetName(bsObj)

    if choice == "t":
        DownloadTogether(images, name)
    elif choice == "a":
        DownloadApart(images, name)
    else:
        print("Wrong choice~~~~")

print("All images finish!!!!!")

Grabbed 181 images in one go, not bad at all…

Changelog:

  • version: v1.2
  • date: 2022.4.2
  • type:
    • Features: images can be saved into separate per-article folders or all together in one place
    • Changed: changed the design philosophy: open it and it runs, with no extra interaction needed
    • Removed: removed most of the interactive controls
  • desc:
    • A variant of v1.1 aimed at grabbing lots of images fast with as little interaction as possible; the cost is that the program can no longer be steered and will Download the same images repeatedly (a de-duplication sketch follows this list)
    • The output directory sometimes has to be adjusted by hand
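
Since the duplicate downloads are the biggest annoyance, here is a minimal de-duplication sketch of my own (not part of v1.2): remember every data-src URL in a set and skip any that were already fetched.

downloaded = set()   # data-src URLs fetched so far

def DownloadOnce(url, filename):
    # Hypothetical helper: download url to filename unless already seen
    if url in downloaded:
        return False
    downloaded.add(url)
    resp = req.get(url)
    with open(filename, 'wb') as f:
        f.write(resp.content)
    return True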

爬虫小子_V1.3

A special edition for my desktop pet: the code has been tidied up and the functions organized into a class

from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as req
import os

class Crawler():
    def __init__(self):
        self.stop = 0  # set to 1 by Stop() to abort the crawl
        self.n = 0     # total images downloaded

    def GetNextURL(self, target):
        self.html = urlopen(target)
        self.bsObj = BeautifulSoup(self.html, "html.parser")
        self.name = self.GetName(self.bsObj)
        if self.name == None:
            print("Something wrong...")
        else:
            print("The next one is " + self.name)
        return self.bsObj

    def GetAllURL(self):
        #self.target = "https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzUzNjA0MjkxMw==&action=getalbum&album_id=2119868775733706753&scene=173&from_msgid=2247561088&from_itemidx=1&count=3&nolastread=1#wechat_redirect"
        self.target = "https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzUzNjA0MjkxMw==&action=getalbum&album_id=2342983337092759553&scene=173&from_msgid=2247561474&from_itemidx=1&count=3&nolastread=1#wechat_redirect"
        self.html = urlopen(self.target)
        self.bsObj = BeautifulSoup(self.html, "html.parser")
        self.link_list = self.GetLinks(self.bsObj)
        return self.link_list

    def Getimage(self, bsObj):
        self.images = bsObj.findAll("img", {"class": "rich_pages wxw-img"})
        return self.images

    def GetName(self, bsObj):
        # Try both title formats the album uses; return None if neither yields digits
        self.title = bsObj.find("h1", {"class": "rich_media_title"})
        self.name = self.title.string.split('【二次元动漫壁纸】')[-1][:3]
        if self.name.isdigit():
            return "image-" + self.name
        else:
            self.name = self.title.string.split('【二次元壁纸分享】')[-1][:3]
            if self.name.isdigit():
                return "image-" + self.name
            else:
                return None

    def GetLinks(self, bsObj):
        self.links = bsObj.findAll("li", {"class": "album__list-item js_album_item js_wx_tap_highlight wx_tap_cell"})
        self.link_list = []
        for link in self.links:
            if link.attrs['data-link'] is not None:
                if link.attrs['data-link'] not in self.link_list:
                    self.link_list.append(link.attrs['data-link'])
        return self.link_list

    def DownloadTogether(self, images, name):
        self.i = 0
        for image in images:
            self.i += 1
            self.n += 1
            self.response = image.attrs["data-src"]
            self.num = name + "-" + str(self.i)
            self.dtype = self.response.split('=')[-1]
            self.num += '.' + self.dtype
            self.response = req.get(self.response)
            print("finish!!!, now you get " + str(self.n) + " => " + self.num)
            # Save path is hard-coded; adjust it for your own machine
            self.path = os.path.join("D:\\PythonProject\\Images", self.num)
            with open(self.path, 'wb') as f:
                f.write(self.response.content)

    def Start(self):
        self.link_list = self.GetAllURL()
        for link in self.link_list:
            # Check the stop flag before each article so Stop() takes effect promptly
            if self.stop == 1:
                print("OK quit....")
                return None
            self.bsObj = self.GetNextURL(link)
            self.images = self.Getimage(self.bsObj)
            self.name = self.GetName(self.bsObj)
            if self.name != None:
                self.DownloadTogether(self.images, self.name)

    def Stop(self):
        self.stop = 1

if __name__ == "__main__":
    crawler = Crawler()
    crawler.Start()

Changelog:

  • version: v1.3
  • date: 2022.5.14
  • type:
    • Features:
      • organized the functions into a class
      • added a Stop feature (a usage sketch follows this changelog)
      • fixed some bugs
    • Changed:
      • the save path is now fixed (hard-coded as an absolute path)
    • Removed:
      • removed per-article storage; everything is saved together now
  • desc:
    • This crawler was built for my desktop pet; feature-wise it's barely an upgrade (some things were even cut)
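
Since Start() checks the stop flag before each article, the desktop pet can run the crawler on a worker thread and cancel it from outside. A minimal usage sketch (my own illustration; the pet's actual integration code isn't shown here):

import threading
import time

crawler = Crawler()
worker = threading.Thread(target=crawler.Start)
worker.start()

time.sleep(30)    # let it crawl for a while...
crawler.Stop()    # Start() notices self.stop == 1 before the next article
worker.join()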