20,741
edits
No edit summary |
|||
Line 59: | Line 59: | ||
print ('Error: Creating directory. ' + directory) | print ('Error: Creating directory. ' + directory) | ||
class dTPPSpider(scrapy.Spider): | class dTPPSpider(scrapy.Spider): | ||
name = | name = 'dTPPSpider' | ||
# https://doc.scrapy.org/en/latest/topics/settings.html | |||
custom_settings = { | |||
'HTTPCACHE_ENABLED': True, | |||
'HTTPCACHE_STORAGE': 'scrapy.extensions.httpcache.FilesystemCacheStorage', | |||
'HTTPCACHE_POLICY': 'scrapy.extensions.httpcache.RFC2616Policy' | |||
} | |||
allowed_domains = ["155.178.201.160"] | allowed_domains = ["155.178.201.160"] | ||
start_urls = ["http://155.178.201.160/d-tpp/1712/"] | |||
start_urls = [ "http://155.178.201.160/d-tpp/1712/", | |||
"http://155.178.201.160/d-tpp/1713/"] | |||
def parse(self, response): | def parse(self, response): | ||
Line 75: | Line 82: | ||
def save_pdf(self, response): | def save_pdf(self, response): | ||
directory = './PDF/' | |||
path = response.url.split('/')[-1] | path = response.url.split('/')[-1] | ||
self.logger.info('Saving PDF %s', path) | cycle = response.url.split('/')[-2] | ||
with open(' | createFolder(directory) | ||
createFolder(directory+cycle) | |||
# TODO: split folder (AIRAC cycle) | |||
self.logger.info('Saving PDF %s (cycle:%s)', path, cycle) | |||
with open(directory + '/'+cycle+'/' + path, 'wb') as f: | |||
f.write(response.body) | f.write(response.body) | ||
Line 86: | Line 98: | ||
process.crawl(dTPPSpider) | process.crawl(dTPPSpider) | ||
process.start() # the script will block here until the crawling is finished | process.start() # the script will block here until the crawling is finished |