Will compare against the productIds available at https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite
This commit is contained in:
Diego Ripley
2025-06-27 09:11:42 -04:00
parent b88a2272b4
commit 6ad2e2c4d6
18 changed files with 75040 additions and 0 deletions
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@@ -0,0 +1,26 @@
import scrapy
import re
class StatCanDataTablesSpider(scrapy.Spider):
    """Scrape table identifiers from the Statistics Canada data listing.

    The spider starts at the listing URL in ``start_urls``, extracts every
    identifier that follows a ``<span>Table:</span>`` label in the raw page
    HTML (e.g. ``12-10-0134-01``), and then follows the ``rel=next``
    pagination link until no such link is present.
    """

    name = "statcan_tables"
    allowed_domains = ["www150.statcan.gc.ca"]
    start_urls = [
        "https://www150.statcan.gc.ca/n1/en/type/data?count=100#tables",
    ]

    # Compiled once for all pages. Captures the text after the "Table:" label
    # up to the next tag or line break, e.g. "12-10-0134-01".
    _TABLE_ID_PATTERN = re.compile(r"<span>Table:</span>\s*([^<\n\r]+)")

    def parse(self, response):
        """Yield one item per table ID on the page, then follow pagination.

        Args:
            response: The response for a listing page; its ``text`` is
                searched with a regular expression and its ``css`` selector
                is used to locate the next-page link.

        Yields:
            ``{"table_id": <str>}`` for each non-empty identifier found,
            followed by a request for the next page when an ``a[rel=next]``
            link exists.
        """
        for raw_id in self._TABLE_ID_PATTERN.findall(response.text):
            table_id = raw_id.strip()
            # The capture can consist solely of spaces/tabs (e.g. when the
            # label is immediately followed by another tag); skip those so
            # no blank IDs reach the output.
            if table_id:
                yield {"table_id": table_id}

        # Follow the "Next" pagination link, if the page has one.
        next_page = response.css("a[rel=next]::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)