Howto:Processing d-tpp using Python

import os
import scrapy
from scrapy.crawler import CrawlerProcess


def createFolder(directory):
    # NOTE: only the print() below appears in this revision excerpt; the
    # imports and this standard os.makedirs wrapper are assumed from the
    # surrounding calls.
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)


class dTPPSpider(scrapy.Spider):
    name = 'dTPPSpider'
    # https://doc.scrapy.org/en/latest/topics/settings.html
    custom_settings = {
        'HTTPCACHE_ENABLED': True,
        'HTTPCACHE_STORAGE': 'scrapy.extensions.httpcache.FilesystemCacheStorage',
        'HTTPCACHE_POLICY': 'scrapy.extensions.httpcache.RFC2616Policy',
    }
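    # With the cache enabled, every response is stored on disk (by default
    # under .scrapy/httpcache), and RFC2616Policy honours the server's HTTP
    # caching headers, so an interrupted or repeated run does not
    # re-download charts that are still fresh.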


    allowed_domains = ["155.178.201.160"]
    start_urls = ["http://155.178.201.160/d-tpp/1712/",
                  "http://155.178.201.160/d-tpp/1713/"]


    def parse(self, response):
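        # The body of parse() is elided in this excerpt. A minimal sketch of
        # the usual Scrapy pattern, assuming the cycle index is a plain HTML
        # directory listing (the selector and the extension check are
        # assumptions): follow every PDF link and hand it to save_pdf.
        for href in response.css('a::attr(href)').extract():
            if href.lower().endswith('.pdf'):
                yield response.follow(href, callback=self.save_pdf)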


    def save_pdf(self, response):
        directory = './PDF/'
        # The file name and the AIRAC cycle are the last two segments
        # of the URL path.
        path = response.url.split('/')[-1]
        cycle = response.url.split('/')[-2]
        createFolder(directory)
        createFolder(directory + cycle)
        # TODO: split folder (AIRAC cycle)
        self.logger.info('Saving PDF %s (cycle:%s)', path, cycle)
        with open(directory + cycle + '/' + path, 'wb') as f:
            f.write(response.body)






process = CrawlerProcess()  # assumed: the instantiation is elided in this excerpt

process.crawl(dTPPSpider)
process.start()  # the script will block here until the crawling is finished
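When the crawl finishes, the charts are laid out one folder per cycle, e.g. ./PDF/1712/ and ./PDF/1713/, with one PDF per chart.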
