Howto:Processing d-tpp using Python

import os
import scrapy
from scrapy.crawler import CrawlerProcess


def createFolder(directory):
    # NOTE: only the print() below appears in this revision excerpt; the
    # imports and this standard os.makedirs wrapper are assumed from the
    # surrounding calls.
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)


class dTPPSpider(scrapy.Spider):
    name = 'dTPPSpider'
    # https://doc.scrapy.org/en/latest/topics/settings.html
    custom_settings = {
        'HTTPCACHE_ENABLED': True,
        'HTTPCACHE_STORAGE': 'scrapy.extensions.httpcache.FilesystemCacheStorage',
        'HTTPCACHE_POLICY': 'scrapy.extensions.httpcache.RFC2616Policy',
    }
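    # With the cache enabled, every response is stored on disk (by default
    # under .scrapy/httpcache), and RFC2616Policy honours the server's HTTP
    # caching headers, so an interrupted or repeated run does not
    # re-download charts that are still fresh.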


    allowed_domains = ["155.178.201.160"]
    start_urls = ["http://155.178.201.160/d-tpp/1712/",
                  "http://155.178.201.160/d-tpp/1713/"]


    def parse(self, response):
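        # The body of parse() is elided in this excerpt. A minimal sketch of
        # the usual Scrapy pattern, assuming the cycle index is a plain HTML
        # directory listing (the selector and the extension check are
        # assumptions): follow every PDF link and hand it to save_pdf.
        for href in response.css('a::attr(href)').extract():
            if href.lower().endswith('.pdf'):
                yield response.follow(href, callback=self.save_pdf)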


    def save_pdf(self, response):
        directory = './PDF/'
        # The file name and the AIRAC cycle are the last two segments
        # of the URL path.
        path = response.url.split('/')[-1]
        cycle = response.url.split('/')[-2]
        createFolder(directory)
        createFolder(directory + cycle)
        # TODO: split folder (AIRAC cycle)
        self.logger.info('Saving PDF %s (cycle:%s)', path, cycle)
        with open(directory + cycle + '/' + path, 'wb') as f:
            f.write(response.body)






process = CrawlerProcess()  # assumed: the instantiation is elided in this excerpt

process.crawl(dTPPSpider)
process.start()  # the script will block here until the crawling is finished
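When the crawl finishes, the charts are laid out one folder per cycle, e.g. ./PDF/1712/ and ./PDF/1713/, with one PDF per chart.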
