
Side-project series: 爬虫小子 (updated occasionally)

爬虫小子: a blessing for anime fans

When I get tired of studying low-level computer fundamentals, I occasionally tinker with something less serious as a reward (just kidding)

Recently I've been working through the book Web Scraping with Python (《Python网络数据采集》), picking up a lot of networking-related knowledge and getting a taste of object-oriented program design, though the main goal is of course to learn web scraping

My feeling is that scraping is easy to pick up but tough to master (especially now that anti-scraping techniques keep getting more sophisticated). Here I'll just document the scrappy crawlers I've written…
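
As the most basic countermeasure against such blocking, here is a minimal sketch of my own (not part of any of the scripts below) that fetches a page while sending a browser-like User-Agent with requests; the URL is just a placeholder:

import requests as req

# Many sites reject the default python-requests User-Agent,
# so present a browser-like one instead. Placeholder URL.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
resp = req.get("https://example.com", headers=headers, timeout=10)
print(resp.status_code, len(resp.content))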

爬虫小子_V1.0

from urllib.request import urlopen
from bs4 import BeautifulSoup

import requests as req
import os

def GetURL():
    # Ask for the article URL and collect all wallpaper <img> tags on it
    target = input("please input your target:")
    html = urlopen(target)
    bsObj = BeautifulSoup(html, "html.parser")
    images = bsObj.findAll("img", {"class": "rich_pages wxw-img"})
    return images

def CreateImages(images, name):
    i = 0
    for image in images:
        i = i + 1
        # The real image URL lives in the data-src attribute
        response = image.attrs["data-src"]
        print("finish!!!, now you get " + str(i) + " images")
        num = "test_" + str(i)
        # The URL ends in "wx_fmt=<type>", so the field after the last '=' is the extension
        dtype = response.split('=')[-1]
        num += '.' + dtype
        response = req.get(response)
        os.makedirs(name, exist_ok=True)
        with open(name + '/' + num, 'wb') as f:
            f.write(response.content)

images = GetURL()
CreateImages(images, name=input("please input its name:"))
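
For context: the data-src URLs in these WeChat articles end in a wx_fmt query parameter, which is why splitting on '=' and taking the last field yields the file extension. An illustrative (made-up) example:

# Illustrative URL only, not one taken from a real article
url = "https://mmbiz.qpic.cn/mmbiz_jpg/xxxx/640?wx_fmt=jpeg"
print(url.split('=')[-1])   # -> "jpeg"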

Let's see the results:

No sample images here; eager readers can go crawl them for themselves…

Changelog:

  • version: v1.0
  • date: 2022.3.31
  • type:
    • Features: NULL
    • Changed: NULL
    • Removed: NULL
  • desc:
    • First version; the functionality is very weak

爬虫小子_V1.1

from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as req
import random
import time
import os

nowTime = time.time()
random.seed(nowTime)

def GetURL():
    # Ask for the first article URL and parse it
    target = input("please input your target:")
    html = urlopen(target)
    bsObj = BeautifulSoup(html, "html.parser")
    return bsObj

def GetRandomURL(link_list):
    # Pick a random article from the album and ask whether to download it
    target = link_list[random.randint(0, len(link_list) - 1)]
    html = urlopen(target)
    bsObj = BeautifulSoup(html, "html.parser")
    name = GetName(bsObj)
    print("The next one is " + name)
    print("Do you want to Download it?")
    choice = input("Please input yes or no\n")
    if choice == "yes":
        print("You want to Continue OK~~~")
        images = Getimage(bsObj)
        Download(images, name)
    elif choice == "no":
        print("Try again?")
        choice = input("Please input yes or no\n")
        if choice == "yes":
            GetRandomURL(link_list)
        else:
            Exit()
    else:
        print("Wrong choice~~~~")
        Exit()

def More():
    print("Do you want more?")
    choice = input("Please input yes or no\n")
    if choice == "yes":
        print("hahaha I know you~~~~")
        Return()
    elif choice == "no":
        Exit()
    else:
        print("Wrong choice~~~~")
        Exit()
    More()  # loop by recursion until the user chooses to exit

def Return():
    # Fetch the album page and collect links to the individual articles
    target = "https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzUzNjA0MjkxMw==&action=getalbum&album_id=2119868775733706753&scene=173&from_msgid=2247561088&from_itemidx=1&count=3&nolastread=1#wechat_redirect"
    html = urlopen(target)
    bsObj = BeautifulSoup(html, "html.parser")
    link_list = GetLinks(bsObj)
    GetRandomURL(link_list)

def Getimage(bsObj):
    images = bsObj.findAll("img", {"class": "rich_pages wxw-img"})
    return images

def GetName(bsObj):
    # Titles look like "【二次元壁纸分享】123…", so keep the three digits after the tag
    title = bsObj.find("h1", {"class": "rich_media_title"})
    name = "image-" + title.string.split('【二次元壁纸分享】')[-1][:3]
    return name

def GetLinks(bsObj):
    links = bsObj.findAll("li", {"class": "album__list-item js_album_item js_wx_tap_highlight wx_tap_cell"})
    link_list = []
    for link in links:
        if link.attrs['data-link'] is not None:
            if link.attrs['data-link'] not in link_list:
                link_list.append(link.attrs['data-link'])
    return link_list

def Download(images, name):
    i = 0
    for image in images:
        i = i + 1
        print("finish!!!, now you get " + str(i) + " images")
        response = image.attrs["data-src"]
        num = name + "-" + str(i)
        dtype = response.split('=')[-1]
        num += '.' + dtype
        response = req.get(response)
        os.makedirs(name, exist_ok=True)
        with open(name + '/' + num, 'wb') as f:
            f.write(response.content)

def Exit():
    print("exit~~~~bye~~~~")
    exit(0)

bsObj = GetURL()
images = Getimage(bsObj)
name = GetName(bsObj)
Download(images, name)
More()
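
One quirk worth noting: More() loops by calling itself recursively, so an extremely persistent user could in theory hit Python's recursion limit. A minimal iterative sketch of the same prompt loop (my own variation, not part of the script above):

def MoreLoop():
    # Same idea as More(), but a plain while-loop instead of recursion
    while True:
        print("Do you want more?")
        choice = input("Please input yes or no\n")
        if choice == "yes":
            Return()   # pick and download another random article
        else:
            Exit()     # any other answer quits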

Changelog:

  • version: v1.1
  • date: 2022.4.2
  • type:
    • Features: after a Download finishes you can choose to keep Downloading; some of the album's internal links can be indexed for Download; the crawler is more interactive
    • Changed: NULL
    • Removed: removed the step of typing in a file name
  • desc:
    • The functionality is a bit stronger, but nothing qualitatively new

爬虫小子_V1.2

from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as req
import os

n = 0  # global counter of images downloaded across all articles

def GetNextURL(target):
    html = urlopen(target)
    bsObj = BeautifulSoup(html, "html.parser")
    name = GetName(bsObj)
    print("The next one is " + name)
    return bsObj

def GetAllURL():
    # Fetch the album page and collect links to every article in it
    target = "https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzUzNjA0MjkxMw==&action=getalbum&album_id=2119868775733706753&scene=173&from_msgid=2247561088&from_itemidx=1&count=3&nolastread=1#wechat_redirect"
    html = urlopen(target)
    bsObj = BeautifulSoup(html, "html.parser")
    link_list = GetLinks(bsObj)
    return link_list

def Getimage(bsObj):
    images = bsObj.findAll("img", {"class": "rich_pages wxw-img"})
    return images

def GetName(bsObj):
    title = bsObj.find("h1", {"class": "rich_media_title"})
    name = "image-" + title.string.split('【二次元壁纸分享】')[-1][:3]
    return name

def GetLinks(bsObj):
    links = bsObj.findAll("li", {"class": "album__list-item js_album_item js_wx_tap_highlight wx_tap_cell"})
    link_list = []
    for link in links:
        if link.attrs['data-link'] is not None:
            if link.attrs['data-link'] not in link_list:
                link_list.append(link.attrs['data-link'])
    return link_list

def DownloadApart(images, name):
    # Save each article's images into its own folder
    global n
    i = 0
    for image in images:
        i += 1
        n += 1
        print("finish!!!, now you get " + str(n) + " images")
        response = image.attrs["data-src"]
        num = name + "-" + str(i)
        dtype = response.split('=')[-1]
        num += '.' + dtype
        response = req.get(response)
        os.makedirs(name, exist_ok=True)
        with open(name + '/' + num, 'wb') as f:
            f.write(response.content)

def DownloadTogether(images, name):
    # Save all images into the current working directory
    global n
    i = 0
    for image in images:
        i += 1
        n += 1
        print("finish!!!, now you get " + str(n) + " images")
        response = image.attrs["data-src"]
        num = name + "-" + str(i)
        dtype = response.split('=')[-1]
        num += '.' + dtype
        response = req.get(response)
        with open(num, 'wb') as f:
            f.write(response.content)

print("Do you want to Together or Apart?")
choice = input("Please input 't' for Together 'a' for Apart\n")

link_list = GetAllURL()
for link in link_list:
    bsObj = GetNextURL(link)
    images = Getimage(bsObj)
    name = GetName(bsObj)

    if choice == "t":
        DownloadTogether(images, name)
    elif choice == "a":
        DownloadApart(images, name)
    else:
        print("Wrong choice~~~~")

print("All images finish!!!!!")

Grabbed 181 images in one go, not bad at all…

Changelog:

  • version: v1.2
  • date: 2022.4.2
  • type:
    • Features: images can be saved into separate per-article folders or all together in one place
    • Changed: changed the design philosophy: open it and it runs, with no extra interaction needed
    • Removed: removed most of the interactive controls
  • desc:
    • A variant of v1.1 aimed at grabbing lots of images fast with as little interaction as possible; the cost is that the program can no longer be steered and will Download the same images repeatedly (a de-duplication sketch follows this list)
    • The output directory sometimes has to be adjusted by hand
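
Since the duplicate downloads are the biggest annoyance, here is a minimal de-duplication sketch of my own (not part of v1.2): remember every data-src URL in a set and skip any that were already fetched.

downloaded = set()   # data-src URLs fetched so far

def DownloadOnce(url, filename):
    # Hypothetical helper: download url to filename unless already seen
    if url in downloaded:
        return False
    downloaded.add(url)
    resp = req.get(url)
    with open(filename, 'wb') as f:
        f.write(resp.content)
    return True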

爬虫小子_V1.3

A special edition for my desktop pet: the code has been tidied up and the functions organized into a class

from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests as req
import os

class Crawler():
    def __init__(self):
        self.stop = 0  # set to 1 by Stop() to abort the crawl
        self.n = 0     # total images downloaded

    def GetNextURL(self, target):
        self.html = urlopen(target)
        self.bsObj = BeautifulSoup(self.html, "html.parser")
        self.name = self.GetName(self.bsObj)
        if self.name == None:
            print("Something wrong...")
        else:
            print("The next one is " + self.name)
        return self.bsObj

    def GetAllURL(self):
        #self.target = "https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzUzNjA0MjkxMw==&action=getalbum&album_id=2119868775733706753&scene=173&from_msgid=2247561088&from_itemidx=1&count=3&nolastread=1#wechat_redirect"
        self.target = "https://mp.weixin.qq.com/mp/appmsgalbum?__biz=MzUzNjA0MjkxMw==&action=getalbum&album_id=2342983337092759553&scene=173&from_msgid=2247561474&from_itemidx=1&count=3&nolastread=1#wechat_redirect"
        self.html = urlopen(self.target)
        self.bsObj = BeautifulSoup(self.html, "html.parser")
        self.link_list = self.GetLinks(self.bsObj)
        return self.link_list

    def Getimage(self, bsObj):
        self.images = bsObj.findAll("img", {"class": "rich_pages wxw-img"})
        return self.images

    def GetName(self, bsObj):
        # Try both title formats the album uses; return None if neither yields digits
        self.title = bsObj.find("h1", {"class": "rich_media_title"})
        self.name = self.title.string.split('【二次元动漫壁纸】')[-1][:3]
        if self.name.isdigit():
            return "image-" + self.name
        else:
            self.name = self.title.string.split('【二次元壁纸分享】')[-1][:3]
            if self.name.isdigit():
                return "image-" + self.name
            else:
                return None

    def GetLinks(self, bsObj):
        self.links = bsObj.findAll("li", {"class": "album__list-item js_album_item js_wx_tap_highlight wx_tap_cell"})
        self.link_list = []
        for link in self.links:
            if link.attrs['data-link'] is not None:
                if link.attrs['data-link'] not in self.link_list:
                    self.link_list.append(link.attrs['data-link'])
        return self.link_list

    def DownloadTogether(self, images, name):
        self.i = 0
        for image in images:
            self.i += 1
            self.n += 1
            self.response = image.attrs["data-src"]
            self.num = name + "-" + str(self.i)
            self.dtype = self.response.split('=')[-1]
            self.num += '.' + self.dtype
            self.response = req.get(self.response)
            print("finish!!!, now you get " + str(self.n) + " => " + self.num)
            # Save path is hard-coded; adjust it for your own machine
            self.path = os.path.join("D:\\PythonProject\\Images", self.num)
            with open(self.path, 'wb') as f:
                f.write(self.response.content)

    def Start(self):
        self.link_list = self.GetAllURL()
        for link in self.link_list:
            # Check the stop flag before each article so Stop() takes effect promptly
            if self.stop == 1:
                print("OK quit....")
                return None
            self.bsObj = self.GetNextURL(link)
            self.images = self.Getimage(self.bsObj)
            self.name = self.GetName(self.bsObj)
            if self.name != None:
                self.DownloadTogether(self.images, self.name)

    def Stop(self):
        self.stop = 1

if __name__ == "__main__":
    crawler = Crawler()
    crawler.Start()

Changelog:

  • version: v1.3
  • date: 2022.5.14
  • type:
    • Features:
      • organized the functions into a class
      • added a Stop feature (a usage sketch follows this changelog)
      • fixed some bugs
    • Changed:
      • the save path is now fixed (hard-coded as an absolute path)
    • Removed:
      • removed per-article storage; everything is saved together now
  • desc:
    • This crawler was built for my desktop pet; feature-wise it's barely an upgrade (some things were even cut)
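
Since Start() checks the stop flag before each article, the desktop pet can run the crawler on a worker thread and cancel it from outside. A minimal usage sketch (my own illustration; the pet's actual integration code isn't shown here):

import threading
import time

crawler = Crawler()
worker = threading.Thread(target=crawler.Start)
worker.start()

time.sleep(30)    # let it crawl for a while...
crawler.Stop()    # Start() notices self.stop == 1 before the next article
worker.join()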