import time

import scrapy
from scrapy import signals
from scrapy.exceptions import CloseSpider
from scrapy.spiders import Spider
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from ..items import PicItem


class MirrowSpider(Spider):
    name = "moving_pic"
    allowed_domains = ["dimtown.com"]
    start_urls = ["https://dimtown.com/jxmt"]
    title_count = 0
    title_limit = 200

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # Connect spider_closed through the crawler's signal manager; this is
        # the documented replacement for the deprecated pydispatch dispatcher.
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # A single shared Chrome instance is reused for every page.
        self.driver = webdriver.Chrome()

    def spider_closed(self, spider):
        # Shut the browser down when the crawl ends.
        self.driver.quit()
    def parse(self, response):
        # headers.get returns bytes (or None if the header is absent).
        user_agent = response.request.headers.get('User-Agent', b'').decode('utf-8')
        self.logger.info("Current User-Agent: %s", user_agent)

        # Re-open the page in Selenium so JavaScript-rendered content loads.
        self.driver.get(response.url)
        wait = WebDriverWait(self.driver, 20)

        try:
            # Switch the listing to "most commented" order.
            comments_link = wait.until(
                EC.element_to_be_clickable((By.XPATH, '//a[@data-orderby="comment_count"]'))
            )
            self.logger.info('Found the "Most Comments" link, clicking it')
            comments_link.click()

            # Keep scrolling to the bottom until the page height stops
            # growing, i.e. the infinite scroll has loaded everything.
            last_height = self.driver.execute_script("return document.body.scrollHeight")
            while True:
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                self.logger.info("Scrolled to the bottom of the page")
                time.sleep(5)
                new_height = self.driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            # Collect every detail-page link rendered in the listing.
            links = self.driver.find_elements(By.XPATH, '//a[contains(@href, ".html")]')
            detail_urls = [link.get_attribute('href') for link in links]
            self.logger.info("All detail page links: %s", detail_urls)

            for url in detail_urls:
                yield scrapy.Request(url, callback=self.parse_detail)
        except Exception as e:
            self.logger.error("Error while parsing the listing page: %s", e)
    def parse_detail(self, response):
        user_agent = response.request.headers.get('User-Agent', b'').decode('utf-8')
        self.logger.info("Current User-Agent: %s", user_agent)

        self.driver.get(response.url)
        wait = WebDriverWait(self.driver, 20)

        try:
            title = wait.until(
                EC.visibility_of_element_located((By.XPATH, '//h1'))
            ).text.strip()
            self.logger.info("Title: %s", title)

            img_urls = [
                img.get_attribute('src')
                for img in self.driver.find_elements(By.XPATH, '//img[@decoding="async"]')
            ]
            self.logger.info("All image URLs: %s", img_urls)

            if not img_urls:
                self.logger.warning("No image URLs found.")

            yield PicItem(image_urls=img_urls, title=title)

            # Stop after title_limit detail pages. Raising CloseSpider is the
            # documented way to end a crawl from a callback; the original
            # called the internal engine.close_spider API directly.
            self.title_count += 1
            if self.title_count >= self.title_limit:
                raise CloseSpider("Title limit reached, stopping the spider")
        except CloseSpider:
            # Let Scrapy handle the shutdown rather than logging it as an error.
            raise
        except Exception as e:
            self.logger.error("Error while parsing the detail page: %s", e)
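
# --- Reference sketch (not part of this file): the PicItem imported above ---
# The spider pulls PicItem from ..items, which is not shown here. A minimal
# definition compatible with the fields used above would look roughly like
# this; the `images` field is an assumption, following the convention of
# Scrapy's built-in ImagesPipeline, which populates it with download results.
#
# import scrapy
#
# class PicItem(scrapy.Item):
#     title = scrapy.Field()
#     image_urls = scrapy.Field()  # input: URLs for ImagesPipeline to fetch
#     images = scrapy.Field()      # output: filled in by ImagesPipeline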
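
# --- Reference sketch (assumed settings): downloading the collected images ---
# Since the item exposes an `image_urls` field, the natural consumer is
# Scrapy's built-in ImagesPipeline. Assuming the default field names, the
# project settings would enable it roughly as below; the IMAGES_STORE path
# is a placeholder. Run the spider with: scrapy crawl moving_pic
#
# ITEM_PIPELINES = {
#     "scrapy.pipelines.images.ImagesPipeline": 1,
# }
# IMAGES_STORE = "downloaded_images"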