
Running Scrapy tasks in a loop in Python

I have the following code:

from logging import INFO

import scrapy


class LinkedInAnonymousSpider(scrapy.Spider):
    name = "linkedin_anonymous"
    allowed_domains = ["linkedin.com"]
    start_urls = []

    base_url = "https://www.linkedin.com/pub/dir/?first=%s&last=%s&search=Search"

    def __init__(self, input=None, first=None, last=None):
        self.input = input  # source file name
        self.first = first
        self.last = last

    def start_requests(self):
        if self.first and self.last:  # taking input from command-line parameters
            url = self.base_url % (self.first, self.last)
            yield self.make_requests_from_url(url)
        elif self.input:  # taking input from a file
            i = 0
            self.log('Input from file: %s' % self.input, INFO)
            for line in open(self.input, 'r').readlines():
                i += 1
                if line.strip():  # skip blank lines
                    t = line.split("\t")
                    name = t[0]
                    parts = [n.strip() for n in name.split(' ')]
                    last = parts.pop()
                    first = " ".join(parts)

                    if first and last:
                        url = self.base_url % (first, last)
                        yield self.make_requests_from_url(url)
        else:
            raise Exception('No input.')

    def parse(self, response):
        # if there is exactly one match, the person's profile page is returned
        if response.xpath('//div[@class="profile-overview-content"]').extract():
            yield scrapy.Request(response.url, callback=self.parse_full_profile_page)
        else:
            # extracting profile URLs from the search results
            for sel in response.css('div.profile-card'):
                url = sel.xpath('./*/h3/a/@href').extract()[0]  # person's full profile URL on LinkedIn
                yield scrapy.Request(url, callback=self.parse_full_profile_page)

........
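As a side note, the per-line name handling in start_requests can be checked in isolation. The sketch below uses a made-up sample line; the tab-separated layout with the full name in the first column is only assumed from the split("\t") call above.

# Minimal, standalone illustration of the name-splitting logic used in
# start_requests above. The sample line is invented for demonstration.
line = "James Tiberius Kirk\tprofile-notes\n"

name = line.split("\t")[0]                    # first column holds the full name
parts = [n.strip() for n in name.split(' ')]
last = parts.pop()                            # 'Kirk'
first = " ".join(parts)                       # 'James Tiberius'

print(first, last)                            # -> James Tiberius Kirk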

With this code I retrieve the profile data for a list of people from LinkedIn.

I wrote the following main function to do that.

import scrapy
import sys

from linkedin_anonymous_spider import LinkedInAnonymousSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

if __name__ == "__main__":
    firstname = ['Hasan', 'James']
    lastname = ['Arslan', 'Bond']
    for a in range(len(firstname)):
        settings = get_project_settings()
        crawler = CrawlerProcess(settings)
        spider = LinkedInAnonymousSpider()
        crawler.crawl(spider, [], firstname[a], lastname[a])
        crawler.start()

When the loop reaches its second iteration, I get the following error:

raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable

How can I fix this problem?

Thanks.

Answers


You can run only one reactor, so call crawler.start() just once.

Move crawler.start() out of the loop.


Here is the corrected version:

firstname = ['Hasan', 'James'] 
lastname = ['Arslan', 'Bond'] 
settings = get_project_settings() 
crawler = CrawlerProcess(settings) 

for a in range(len(firstname)): 
    crawler.crawl(LinkedInAnonymousSpider, [], firstname[a], lastname[a]) 

crawler.start()
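This works because CrawlerProcess.crawl() only schedules a crawl; crawler.start() then starts the Twisted reactor once and blocks until every scheduled spider has finished. As a variation on the answer above (a sketch, not part of the original answer), the same loop can pass the names as keyword arguments, which crawl() forwards to the spider's __init__:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from linkedin_anonymous_spider import LinkedInAnonymousSpider

firstname = ['Hasan', 'James']
lastname = ['Arslan', 'Bond']

settings = get_project_settings()
crawler = CrawlerProcess(settings)

# crawl() only schedules the spiders; the extra keyword arguments are passed
# through to LinkedInAnonymousSpider.__init__ (input, first, last).
for first, last in zip(firstname, lastname):
    crawler.crawl(LinkedInAnonymousSpider, input=None, first=first, last=last)

# start() runs the reactor once and blocks until all scheduled crawls finish.
crawler.start()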