Howto:Processing d-tpp using Python: Difference between revisions

Jump to navigation Jump to search
Line 30: Line 30:
=== Scraping ===
<syntaxhighlight lang="python">
import os
# `import urlparse` is Python 2 only (the module moved in Python 3); bind
# the new location under the old name so any downstream use keeps working.
import urllib.parse as urlparse

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request

# NOTE(review): a bare module-level ITEM_PIPELINES has no effect — Scrapy
# only honours it when supplied as a setting (e.g. in the dict passed to
# CrawlerProcess below). Kept as-is for backward compatibility.
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
def createFolder(directory):
    """Create *directory* (including parents) if it does not already exist.

    Best-effort: on failure (e.g. permission denied) an error message is
    printed instead of raising, matching the original behaviour.
    """
    try:
        # exist_ok=True replaces the original racy check-then-create
        # (os.path.exists followed by os.makedirs) with one atomic call.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print ('Error: Creating directory. ' +  directory)
       
class dTPPSpider(scrapy.Spider):
    """Download every file linked from the FAA d-TPP directory listing
    and save it under ./PDF/.

    NOTE(review): start_urls is pinned to cycle 1712 — update the cycle
    segment of the URL for newer d-TPP releases.
    """
    name = "pwc_tax"
    allowed_domains = ["155.178.201.160"]
    start_urls = ["http://155.178.201.160/d-tpp/1712/"]

    def parse(self, response):
        """Follow every anchor on the listing page; each fetched
        response is handed to save_pdf."""
        for href in response.css('a::attr(href)').extract():
            yield Request(
                url=response.urljoin(href),
                callback=self.save_pdf
            )

    def save_pdf(self, response):
        """Write the response body to ./PDF/<basename of the URL>.

        Directory links end in '/' and yield an empty basename; the
        original code then tried open('./PDF/', 'wb'), which raises
        IsADirectoryError. Skip those responses instead.
        """
        path = response.url.split('/')[-1]
        if not path:
            self.logger.info('Skipping directory URL %s', response.url)
            return
        self.logger.info('Saving PDF %s', path)
        with open('./PDF/'+path, 'wb') as f:
            f.write(response.body)
# Run the spider with a desktop-browser User-Agent so the server serves
# the listing as it would to a regular client.
crawler_settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
}
process = CrawlerProcess(crawler_settings)
createFolder('./PDF/')  # ensure the download target exists before crawling
process.crawl(dTPPSpider)
process.start()  # blocks here until the crawl has finished
</syntaxhighlight>


Navigation menu