У меня есть такой код:Запуск задач Scrapy в цикле в питоне
from logging import INFO
import scrapy
class LinkedInAnonymousSpider(scrapy.Spider):
name = "linkedin_anonymous"
allowed_domains = ["linkedin.com"]
start_urls = []
base_url = "https://www.linkedin.com/pub/dir/?first=%s&last=%s&search=Search"
def __init__(self, input=None, first=None, last=None):
self.input = input # source file name
self.first = first
self.last = last
def start_requests(self):
if self.first and self.last: # taking input from command line parameters
url = self.base_url % (self.first, self.last)
yield self.make_requests_from_url(url)
elif self.input: # taking input from file
i = 0
self.log('Input from file: %s' % self.input, INFO)
for line in open(self.input, 'r').readlines():
i += 1
if line.strip(): # no blank line
t = line.split("\t")
name = t[0]
parts = [n.strip() for n in name.split(' ')]
last = parts.pop()
first = " ".join(parts)
if first and last:
url = self.base_url % (first, last)
yield self.make_requests_from_url(url)
else:
raise Exception('No input.')
def parse(self, response):
# if there is exactly one match the person's profile page is returned
if response.xpath('//div[@class="profile-overview-content"]').extract():
yield scrapy.Request(response.url, callback=self.parse_full_profile_page)
else:
# extracting profile urls from search result
for sel in response.css('div.profile-card'):
url = sel.xpath('./*/h3/a/@href').extract()[0] # Person's full profile URL in LinkedIn
yield scrapy.Request(url, callback=self.parse_full_profile_page)
........
С помощью этого кода я получаю данные профиля из списка людей из Linkedin.
Я написал такую основную функцию, чтобы сделать это.
import scrapy
import sys
from linkedin_anonymous_spider import LinkedInAnonymousSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
if __name__ == "__main__":
firstname = ['Hasan', 'James']
lastname = ['Arslan', 'Bond']
for a in range(len(firstname)):
settings = get_project_settings()
crawler = CrawlerProcess(settings)
spider = LinkedInAnonymousSpider()
crawler.crawl(spider, [], firstname[a], lastname[a])
crawler.start()
Когда цикл доходит до стадии 2, я получаю такую ошибку:
рейз error.ReactorNotRestartable() twisted.internet.error.ReactorNotRestartable
Как я могу исправить проблема?
Благодаря