mirror of
https://github.com/dataforcanada/d4c-datapkg-statistical.git
synced 2026-06-15 07:00:56 +02:00
Scraping the table names from https://www150.statcan.gc.ca/n1/en/type/data
Will compare against the productIds available at https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite
This commit is contained in:
@@ -0,0 +1,4 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
||||
@@ -0,0 +1,26 @@
|
||||
import scrapy
|
||||
import re
|
||||
|
||||
|
||||
class StatCanDataTablesSpider(scrapy.Spider):
    """Spider that collects table identifiers from the StatCan data listing.

    Starts at the listing URL in ``start_urls`` and, for every page, emits
    one item per table identifier found after a ``<span>Table:</span>``
    label, then follows the ``rel=next`` pagination link if one is present.
    """

    name = "statcan_tables"
    allowed_domains = ["www150.statcan.gc.ca"]
    start_urls = [
        "https://www150.statcan.gc.ca/n1/en/type/data?count=100#tables"
    ]

    # Matches any content after "Table:", stopping at a tag or line break.
    # This finds patterns like: "Table: 12-10-0134-01", "Table: 45-67-8910-02".
    # Compiled once at class level so it is not rebuilt for every response.
    _TABLE_ID_PATTERN = re.compile(r'\<span\>Table:\<\/span\>\s*([^<\n\r]+)')

    def parse(self, response):
        """Yield table-id items for one listing page, then the next-page request.

        Args:
            response: The downloaded listing page; its ``text`` is searched
                with the table-id pattern and its ``css``/``follow`` helpers
                are used for pagination.

        Yields:
            ``{'table_id': <str>}`` for each non-empty identifier found, and
            finally a follow-up request for the "Next" page when one exists.
        """
        for raw_id in self._TABLE_ID_PATTERN.findall(response.text):
            table_id = raw_id.strip()
            # The pattern can backtrack and capture only spaces/tabs when the
            # label is immediately followed by another tag; such a match
            # carries no identifier, so do not emit an empty item for it.
            if not table_id:
                continue
            yield {
                'table_id': table_id
            }

        # Follow the "Next" pagination link
        next_page = response.css('a[rel=next]::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
|
||||
Reference in New Issue
Block a user