Howto:Processing d-tpp using Python: Difference between revisions

Jump to navigation Jump to search
Line 30: Line 30:
=== Scraping ===
<syntaxhighlight lang="python">
import os
# `import urlparse` is Python 2 only (the module moved in Python 3); bind
# the new location under the old name so any downstream use keeps working.
import urllib.parse as urlparse

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request

# NOTE(review): a bare module-level ITEM_PIPELINES has no effect — Scrapy
# only honours it when supplied as a setting (e.g. in the dict passed to
# CrawlerProcess below). Kept as-is for backward compatibility.
ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
def createFolder(directory):
    """Create *directory* (including parents) if it does not already exist.

    Best-effort: on failure (e.g. permission denied) an error message is
    printed instead of raising, matching the original behaviour.
    """
    try:
        # exist_ok=True replaces the original racy check-then-create
        # (os.path.exists followed by os.makedirs) with one atomic call.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print ('Error: Creating directory. ' +  directory)
       
class dTPPSpider(scrapy.Spider):
    """Download every file linked from the FAA d-TPP directory listing
    and save it under ./PDF/.

    NOTE(review): start_urls is pinned to cycle 1712 — update the cycle
    segment of the URL for newer d-TPP releases.
    """
    name = "pwc_tax"
    allowed_domains = ["155.178.201.160"]
    start_urls = ["http://155.178.201.160/d-tpp/1712/"]

    def parse(self, response):
        """Follow every anchor on the listing page; each fetched
        response is handed to save_pdf."""
        for href in response.css('a::attr(href)').extract():
            yield Request(
                url=response.urljoin(href),
                callback=self.save_pdf
            )

    def save_pdf(self, response):
        """Write the response body to ./PDF/<basename of the URL>.

        Directory links end in '/' and yield an empty basename; the
        original code then tried open('./PDF/', 'wb'), which raises
        IsADirectoryError. Skip those responses instead.
        """
        path = response.url.split('/')[-1]
        if not path:
            self.logger.info('Skipping directory URL %s', response.url)
            return
        self.logger.info('Saving PDF %s', path)
        with open('./PDF/'+path, 'wb') as f:
            f.write(response.body)
# Run the spider with a desktop-browser User-Agent so the server serves
# the listing as it would to a regular client.
crawler_settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
}
process = CrawlerProcess(crawler_settings)
createFolder('./PDF/')  # ensure the download target exists before crawling
process.crawl(dTPPSpider)
process.start()  # blocks here until the crawl has finished
</syntaxhighlight>


Navigation menu