=== Scraping ===
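The script below uses Scrapy to walk an FAA d-TPP cycle directory listing and download every linked PDF into a local <code>./PDF/</code> folder.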
<syntaxhighlight lang="python">
import os

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request


def createFolder(directory):
    """Create the output directory if it does not already exist."""
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error creating directory: ' + directory)


class dTPPSpider(scrapy.Spider):
    name = "dtpp"
    allowed_domains = ["155.178.201.160"]
    start_urls = ["http://155.178.201.160/d-tpp/1712/"]

    def parse(self, response):
        # Follow only the PDF links in the directory listing; skip
        # parent-directory and metadata links.
        for href in response.css('a::attr(href)').extract():
            if href.lower().endswith('.pdf'):
                yield Request(
                    url=response.urljoin(href),
                    callback=self.save_pdf
                )

    def save_pdf(self, response):
        # Name the local file after the last path segment of the URL.
        path = response.url.split('/')[-1]
        self.logger.info('Saving PDF %s', path)
        with open('./PDF/' + path, 'wb') as f:
            f.write(response.body)


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

createFolder('./PDF/')
process.crawl(dTPPSpider)
process.start()  # the script will block here until the crawling is finished
</syntaxhighlight>
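Note that <code>start_urls</code> is pinned to a single d-TPP publication cycle (<code>1712</code>); point it at the current cycle's folder to fetch up-to-date procedures.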