Scraping the table names from https://www150.statcan.gc.ca/n1/en/type/data

Will compare against the productIds available at https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesListLite
2026-06-15 07:00:56 +02:00 · 2025-06-27 09:11:42 -04:00
parent b88a2272b4
commit 6ad2e2c4d6
18 changed files with 75040 additions and 0 deletions
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
@@ -0,0 +1,26 @@
+import scrapy
+import re
+
+
+class StatCanDataTablesSpider(scrapy.Spider):
+    name = "statcan_tables"
+    allowed_domains = ["www150.statcan.gc.ca"]
+    start_urls = [
+        "https://www150.statcan.gc.ca/n1/en/type/data?count=100#tables"
+    ]
+
+    def parse(self, response):
+        # Use regex to match any content after "Table:", stopping at a tag or line break
+        # This finds patterns like: "Table: 12-10-0134-01", "Table: 45-67-8910-02", etc.
+        matches = re.findall(r'\<span\>Table:\<\/span\>\s*([^<\n\r]+)', response.text)
+
+        for table_id in matches:
+            # Clean up whitespace
+            yield {
+                'table_id': table_id.strip()
+            }
+
+        # Follow the "Next" pagination link
+        next_page = response.css('a[rel=next]::attr(href)').get()
+        if next_page:
+            yield response.follow(next_page, callback=self.parse)