from selenium import webdriver
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ..items import PicItem
class MirrowSpider(CrawlSpider):
    """Crawl dimtown.com cosplay listing pages and render each detail page
    with Selenium to collect the gallery title and image URLs as PicItems.

    A single Chrome instance is shared across all parsed pages and shut
    down when the spider closes.
    """

    name = "mirrow"
    allowed_domains = ["dimtown.com"]
    start_urls = ["https://dimtown.com/cosplay/page/1"]

    rules = (
        # Follow listing pagination (pages 1-2 only), no callback.
        Rule(LinkExtractor(allow=r'/cosplay/page/[1-2]'), follow=True),
        # Detail pages (e.g. /12345.html) are handed to parse_item.
        Rule(LinkExtractor(allow=r'/\d+\.html'), callback='parse_item', follow=False),
    )

    def __init__(self, *args, **kwargs):
        super(MirrowSpider, self).__init__(*args, **kwargs)
        # One browser for the spider's whole lifetime; quit in closed().
        self.driver = webdriver.Chrome()

    def parse_item(self, response):
        """Render a detail page in Chrome and yield a PicItem with its
        <h1> title and the src of every image marked decoding="async".

        Failures (timeout, missing elements) are logged and the page is
        skipped rather than aborting the crawl.
        """
        # Log the User-Agent Scrapy sent; the header may be absent, and
        # decoding None would raise AttributeError, so guard it.
        ua_header = response.request.headers.get('User-Agent')
        user_agent = ua_header.decode('utf-8') if ua_header is not None else 'unknown'
        self.logger.info(f"当前使用的 User-Agent: {user_agent}")

        # Navigate once (the original issued this GET twice in a row).
        self.driver.get(response.url)
        wait = WebDriverWait(self.driver, 10)
        try:
            # Wait until the <h1> is visible so the page has rendered.
            title_element = wait.until(
                EC.visibility_of_element_located((By.XPATH, '//h1'))
            )
            title = title_element.text.strip()
            self.logger.info("标题:%s", title)

            # Gallery images carry decoding="async"; collect their URLs.
            img_url_elements = self.driver.find_elements(By.XPATH, '//img[@decoding="async"]')
            img_urls = [element.get_attribute('src') for element in img_url_elements]
            self.logger.info("所有图片网址:%s", img_urls)

            yield PicItem(image_urls=img_urls, title=title)
        except Exception as e:
            # Best-effort scraping: log and move on to the next page.
            self.logger.error("An error occurred: %s", e)

    def closed(self, reason):
        """Spider-closed signal handler: shut down the Selenium browser."""
        self.driver.quit()