Scrapy Image Crawler

Scrapy Crawler Project: An Image Downloader

1. Creating the Spider

Generate a spider from the CrawlSpider template with:

scrapy genspider -t crawl mirrow dimtown.com
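
This writes a CrawlSpider skeleton to pic/spiders/mirrow.py, which the next section fills in. The generated file looks roughly like this (exact output varies by Scrapy version):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class MirrowSpider(CrawlSpider):
    name = "mirrow"
    allowed_domains = ["dimtown.com"]
    start_urls = ["https://dimtown.com"]

    rules = (Rule(LinkExtractor(allow=r"Items/"), callback="parse_item", follow=True),)

    def parse_item(self, response):
        item = {}
        return item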

2. Implementation

2.1 Main Spider File

from selenium import webdriver
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ..items import PicItem  # adjust the import path to your project layout


class MirrowSpider(CrawlSpider):
    name = "mirrow"
    # start_urls = ["https://www.vilipix.com/p"]  # Pixiv mirror site
    allowed_domains = ["dimtown.com"]
    start_urls = ["https://dimtown.com/cosplay/page/1"]  # Dimtown (次元小镇)

    rules = (
        # Rule(LinkExtractor(allow=r'/p/[0-9a-f]{32}'), callback='parse_item', follow=True),  # Pixiv mirror site
        Rule(LinkExtractor(allow=r'/cosplay/page/[1-2]'), follow=True),                 # listing pages 1-2
        Rule(LinkExtractor(allow=r'/\d+\.html'), callback='parse_item', follow=False),  # Dimtown detail pages
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Chrome()

    def parse_item(self, response):
        # Log the User-Agent this request went out with (set by the
        # RandomUserAgentMiddleware below, so the header is always present)
        user_agent = response.request.headers.get('User-Agent').decode('utf-8')
        self.logger.info("Current User-Agent: %s", user_agent)

        # Re-fetch the page in a real browser so JavaScript-loaded images render
        self.driver.get(response.url)
        wait = WebDriverWait(self.driver, 10)  # generous timeout for slow pages
        try:
            # Wait until the title element is visible, then read its text
            title_element = wait.until(
                # EC.visibility_of_element_located((By.XPATH, '//h1[@class="title"]'))  # Pixiv mirror site
                EC.visibility_of_element_located((By.XPATH, '//h1'))
            )
            title = title_element.text.strip()
            self.logger.info("Title: %s", title)

            # Collect the src attribute of every image on the page
            # Pixiv mirror site: self.driver.find_elements(By.XPATH, '//div[@class="illust-item"]/a/img')
            img_url_elements = self.driver.find_elements(By.XPATH, '//img[@decoding="async"]')
            img_urls = [el.get_attribute('src') for el in img_url_elements]
            self.logger.info("All image URLs: %s", img_urls)

            # Hand the URLs and title to the image pipeline
            yield PicItem(image_urls=img_urls, title=title)

        except Exception as e:
            self.logger.error("An error occurred: %s", e)

    def closed(self, reason):
        # Quit the WebDriver when the spider closes
        self.driver.quit()
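
Both the spider above and the pipeline below assume a PicItem defined in items.py with image_urls, images, and title fields; image_urls and images are the conventional field names that Scrapy's ImagesPipeline reads and writes. The item file isn't shown in the original post, so this is a minimal sketch:

import scrapy


class PicItem(scrapy.Item):
    image_urls = scrapy.Field()  # URLs queued for download by the pipeline
    images = scrapy.Field()      # download results, filled in by the pipeline
    title = scrapy.Field()       # page title, used to name the output folder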

2.2 Pipeline File

import os
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from urllib.parse import urlparse


class CustomImagePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Attach the item to each download request so file_path can read its title
        for image_url in item.get('image_urls', []):
            yield scrapy.Request(image_url, meta={'item': item})

    def file_path(self, request, response=None, info=None, *, item=None):
        # Save each image under a folder named after the post title
        item = request.meta['item']
        title = item.get('title', 'default_title').replace(' ', '_')
        parsed_url = urlparse(request.url)
        image_name = os.path.basename(parsed_url.path)
        return f'{title}/{image_name}'

    def item_completed(self, results, item, info):
        if not results:
            return item

        # Keep only the paths of successfully downloaded images
        image_paths = [x['path'] for ok, x in results if ok]
        item['images'] = image_paths

        return item
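
file_path groups every image from one post into a folder named after its title; the filename itself is just the last path segment of the image URL. For a hypothetical URL it resolves like this:

import os
from urllib.parse import urlparse

url = "https://dimtown.com/wp-content/uploads/2024/05/001.jpg"  # hypothetical example URL
print(os.path.basename(urlparse(url).path))  # -> 001.jpg

Combined with IMAGES_STORE = "COSPLAY" from section 2.4, that image would land at COSPLAY/<title>/001.jpg.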

2.3 Middleware File

from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
from fake_useragent import UserAgent


class CustomRedirectMiddleware(RedirectMiddleware):
    def _redirect(self, redirected, request, spider, reason):
        # Rewrite the redirect target back to the original URL, effectively
        # retrying the same page instead of following the redirect
        redirected = redirected.replace(url=request.url)
        return super()._redirect(redirected, request, spider, reason)


class RandomUserAgentMiddleware(object):
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # Rotate a random browser User-Agent on every request
        request.headers['User-Agent'] = self.ua.random
        request.headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        request.headers['Accept-Language'] = "en"
        request.headers['Referer'] = 'https://dimtown.com/cosplay/page/1'
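
RandomUserAgentMiddleware depends on the third-party fake-useragent package (pip install fake-useragent), which supplies real browser User-Agent strings; rotating them and pinning a plausible Referer makes requests look less like a bot. Both middlewares still have to be registered in DOWNLOADER_MIDDLEWARES, which the settings below take care of.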

2.4 Settings File

BOT_NAME = "pic"

SPIDER_MODULES = ["pic.spiders"]
NEWSPIDER_MODULE = "pic.spiders"

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3

LOG_LEVEL = "INFO"

DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    "Referer": "https://dimtown.com/cosplay/page/1",
}

DOWNLOADER_MIDDLEWARES = {
    # Registered at the same priority (600) as the built-in RedirectMiddleware;
    # to make sure only the custom one runs, you may also want to set
    # "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": None
    "pic.middlewares.CustomRedirectMiddleware": 600,
    "pic.middlewares.RandomUserAgentMiddleware": 543,
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
    "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
    # Disable the stock UserAgentMiddleware so the random one takes effect
    "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
}

ITEM_PIPELINES = {
    "pic.pipelines.CustomImagePipeline": 300,
}

# Root directory for downloaded images (per-title subfolders, see file_path)
IMAGES_STORE = "COSPLAY"

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# Retry settings
RETRY_ENABLED = True
RETRY_TIMES = 5
RETRY_HTTP_CODES = [429, 500, 502, 503, 504, 522, 524, 408]
# Note: RETRY_DELAY is not a stock Scrapy setting; the built-in RetryMiddleware
# retries without an extra delay unless a custom middleware reads this value
RETRY_DELAY = 10

# AutoThrottle settings
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
AUTOTHROTTLE_DEBUG = False

3. Run Results
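
To reproduce the results, run the spider from the project root (the name comes from the spider's name attribute in section 2.1):

scrapy crawl mirrow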

[Screenshot: run result 1]
[Screenshot: run result 2]