diff --git a/experiments/duckdb_census_of_population_lonboard.ipynb b/experiments/duckdb_census_of_population_lonboard.ipynb index e500e01..40dd39c 100644 --- a/experiments/duckdb_census_of_population_lonboard.ipynb +++ b/experiments/duckdb_census_of_population_lonboard.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 12, + "execution_count": 1, "id": "56ac906e", "metadata": {}, "outputs": [], @@ -25,28 +25,32 @@ "id": "5d97e882", "metadata": {}, "source": [ - "# 1.0 Total private dwellings and private dwellings per square kilometer for Ottawa\n", + "# 1. Total private dwellings and private dwellings per square kilometer at Dissemination Area geographic level\n", "These values are from the 2021 Census of Population" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 2, "id": "580c82ad-f64d-439f-9055-2307fdf7cccd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 26, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "\"\"\"\n", + "Vancouver CMA is geo.cma_dguid = '2021S0503933'\n", + "\"\"\"\n", + "\n", "con.execute(\"\"\"\n", "DROP TABLE IF EXISTS geo_data;\n", "CREATE TABLE geo_data AS\n", @@ -56,45 +60,43 @@ " cop.count_total_4,\n", " cop.count_total_6,\n", " cop.count_total_7,\n", - " ROUND(\n", - " (cop.count_total_4 / cop.count_total_7), 2\n", - " ) AS count_total_4_per_square_km,\n", + " CASE\n", + " WHEN cop.count_total_7 = 0.0 THEN 0\n", + " WHEN cop.count_total_4 = 0 THEN 0\n", + " WHEN cop.count_total_4 IS NULL THEN 0\n", + " ELSE \n", + " ROUND(\n", + " (cop.count_total_4 / cop.count_total_7), 2\n", + " ) \n", + " END AS count_total_4_per_square_km,\n", " geo.geom\n", "FROM\n", " 'https://data.dataforcanada.org/processed/statistics_canada/census_of_population/2021/tabular/da_2021.parquet' AS cop,\n", " 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/da_2021.parquet' AS geo\n", - "WHERE geo.csd_name IN ('Vancouver') AND cop.da_dguid = geo.da_dguid;\n", + "WHERE geo.csd_name = 'Vancouver' AND cop.da_dguid = geo.da_dguid;\n", "\n", "\"\"\")\n", "\n", "con.execute(\"\"\"\n", - "COPY geo_data TO './da_2021_private_dwellings.parquet' (FORMAT PARQUET);\n", + "COPY geo_data TO './da_2021_characteristic.parquet' (FORMAT PARQUET);\n", "\"\"\")" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 3, "id": "e4794c4d-6013-40b5-8e59-046fc2495d34", "metadata": {}, "outputs": [], "source": [ - "private_dwellings_per_square_km = con.execute(\"SELECT DISTINCT count_total_4_per_square_km FROM geo_data\").fetchall()\n", + "characteristic_values = con.execute(\"SELECT DISTINCT count_total_4_per_square_km FROM geo_data\").fetchall()\n", "\n", - "values = np.array([v[0] for v in private_dwellings_per_square_km])\n", + "values = np.array([v[0] for v in characteristic_values])\n", "\n", "# Compute Jenks breaks\n", "num_classes = 5\n", - "breaks = jenkspy.jenks_breaks(values, n_classes=num_classes)" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "8672f3f8-82bf-439e-8558-cb3566f2062f", - "metadata": {}, - "outputs": [], - "source": [ + "breaks = jenkspy.jenks_breaks(values, n_classes=num_classes)\n", + "\n", "# Create a bin range mapping: (lower, upper) for each bin\n", "bin_ranges = [(breaks[i], breaks[i+1]) for i in range(len(breaks)-1)]\n", "\n", @@ -102,22 +104,14 @@ "def jenks_range(value) -> str:\n", " for i, (low, high) in enumerate(bin_ranges):\n", " if low <= value <= high:\n", - " return f\"{int(low)}–{int(high)}\"\n", + " return f\"{int(low)}-{int(high)}\"\n", " return \"unknown\"\n", "\n", "\n", - "dwellings_df = gpd.read_parquet('./da_2021_private_dwellings.parquet')\n", - "dwellings_df['category'] = dwellings_df[\"count_total_4_per_square_km\"].apply(lambda v: jenks_range(v))\n", - "dwellings_df['category'] = dwellings_df['category'].astype('category')" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "f265300a-9cf7-4ab7-8bdb-d66feae3a2f8", - "metadata": {}, - "outputs": [], - "source": [ + "characteristic_df = gpd.read_parquet('./da_2021_characteristic.parquet')\n", + "characteristic_df['category'] = characteristic_df[\"count_total_4_per_square_km\"].apply(lambda v: jenks_range(v))\n", + "characteristic_df['category'] = characteristic_df['category'].astype('category')\n", + "\n", "# Categories to colors\n", "cmap = {}\n", "colors = [\n", @@ -127,13 +121,13 @@ " [255, 63.75, 63.75],\n", " [255, 0, 0]\n", "]\n", - "for index, value in enumerate(dwellings_df['category'].unique()):\n", + "for index, value in enumerate(sorted(characteristic_df['category'].unique(), key=lambda x: int(x.split('-')[0]))):\n", " cmap[value] = colors[index]" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 17, "id": "a6a2ae6c-61b7-4c0e-bbe7-a580a511ee5a", "metadata": {}, "outputs": [], @@ -152,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 22, "id": "56e96627-0e82-436a-bd8e-e51546c7526b", "metadata": {}, "outputs": [], @@ -169,53 +163,14 @@ }, { "cell_type": "code", - "execution_count": 35, - "id": "fef53303-147d-44af-b8f4-b824f64b486f", - "metadata": {}, - "outputs": [], - "source": [ - "get_color = apply_categorical_cmap(pa.array(dwellings_df['category']), cmap)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 36, + "execution_count": 6, "id": "6935a061-41fc-4223-b155-4caf4c6df103", "metadata": {}, - "outputs": [], - "source": [ - "cop_layer = PolygonLayer.from_geopandas(gdf=dwellings_df,\n", - " stroked=True,\n", - " get_fill_color=get_color,\n", - " get_line_color=[255, 255, 255],\n", - " get_line_width=5,\n", - " line_width_units=\"meters\",\n", - " opacity=0.4,\n", - " auto_highlight = True,\n", - " highlight_color=[0,0,0,0]\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "1ab3dded-cff9-40cb-ac32-306581018083", - "metadata": {}, - "outputs": [], - "source": [ - "m = Map([basemap, cop_layer])" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "afb530c1-f652-4aaa-a3fc-1214a71ffdce", - "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5d5e30381eff4f839a04ae81eff1b1e2", + "model_id": "0bf8da87ca7045eb9394ed83a01c2857", "version_major": 2, "version_minor": 1 }, @@ -223,14 +178,184 @@ "Map(custom_attribution='', layers=(BitmapTileLayer(data='http://mt0.google.com/vt/lyrs=s&hl=en&x={x}&y={y}&z={…" ] }, - "execution_count": 38, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "get_color = apply_categorical_cmap(pa.array(characteristic_df['category']), cmap)\n", + "\n", + "cop_layer = PolygonLayer.from_geopandas(gdf=characteristic_df,\n", + " stroked=True,\n", + " get_fill_color=get_color,\n", + " get_line_color=[255, 255, 255],\n", + " get_line_width=5,\n", + " line_width_min_pixels=0.2,\n", + " line_width_units=\"meters\",\n", + " opacity=0.4,\n", + " auto_highlight = True\n", + " )\n", + "\n", + "m = Map([basemap, cop_layer])\n", + "\n", "m" ] + }, + { + "cell_type": "markdown", + "id": "2e87d9fa-50d0-4278-99b3-7399b88aa010", + "metadata": {}, + "source": [ + "# 2. Percentage of people with income $100,000 and over\n", + "These values are from the 2021 Census of Population" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "3fcf04bc-2c8b-4e76-9c6d-7d1eb3892dbc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "con.execute(\"\"\"\n", + "DROP TABLE IF EXISTS geo_data_2;\n", + "CREATE TABLE geo_data_2 AS\n", + "SELECT\n", + " geo.da_dguid,\n", + " cop.count_total_1,\n", + " cop.count_total_155,\n", + " cop.count_total_168,\n", + " CASE\n", + " WHEN cop.count_total_168 = 0.0 THEN 0\n", + " WHEN cop.count_total_155 = 0 THEN 0\n", + " WHEN cop.count_total_168 IS NULL THEN 0\n", + " WHEN cop.count_total_155 IS NULL THEN 0\n", + " ELSE \n", + " ((cop.count_total_168/cop.count_total_155) * 100) \n", + " END AS percentage_over_100k,\n", + " geo.geom\n", + "FROM\n", + " 'https://data.dataforcanada.org/processed/statistics_canada/census_of_population/2021/tabular/da_2021.parquet' AS cop,\n", + " 'https://data.dataforcanada.org/processed/statistics_canada/boundaries/2021/digital_boundary_files/da_2021.parquet' AS geo\n", + "WHERE geo.cma_dguid = '2021S0503933' AND cop.da_dguid = geo.da_dguid;\n", + "\"\"\")\n", + "\n", + "con.execute(\"\"\"\n", + "COPY geo_data_2 TO './da_2021_characteristic_2.parquet' (FORMAT PARQUET);\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "b730c891-d3b6-4fb4-a9ea-dd898e9e8490", + "metadata": {}, + "outputs": [], + "source": [ + "characteristic_values = con.execute(\"SELECT DISTINCT percentage_over_100k FROM geo_data_2\").fetchall()\n", + "\n", + "values = np.array([v[0] for v in characteristic_values])\n", + "\n", + "# Compute Jenks breaks\n", + "num_classes = 5\n", + "breaks = jenkspy.jenks_breaks(values, n_classes=num_classes)\n", + "\n", + "# Create a bin range mapping: (lower, upper) for each bin\n", + "bin_ranges = [(breaks[i], breaks[i+1]) for i in range(len(breaks)-1)]\n", + "\n", + "# Create a function to get the range string for a value\n", + "def jenks_range(value) -> str:\n", + " for i, (low, high) in enumerate(bin_ranges):\n", + " if low <= value <= high:\n", + " return f\"{int(low)}-{int(high)}\"\n", + " return \"unknown\"\n", + "\n", + "\n", + "characteristic_df = gpd.read_parquet('./da_2021_characteristic_2.parquet')\n", + "characteristic_df['category'] = characteristic_df[\"percentage_over_100k\"].apply(lambda v: jenks_range(v))\n", + "characteristic_df['category'] = characteristic_df['category'].astype('category')\n", + "\n", + "# Categories to colors\n", + "cmap = {}\n", + "colors = [\n", + " [255, 255, 255],\n", + " [255, 191.25, 191.25],\n", + " [255, 127.50, 127.50],\n", + " [255, 63.75, 63.75],\n", + " [255, 0, 0]\n", + "]\n", + "for index, value in enumerate(sorted(characteristic_df['category'].unique(), key=lambda x: int(x.split('-')[0]))):\n", + " cmap[value] = colors[index]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "8f766c5a-5d6d-490e-b082-6b6efe399409", + "metadata": {}, + "outputs": [], + "source": [ + "get_color = apply_categorical_cmap(pa.array(characteristic_df['category']), cmap)\n", + "\n", + "cop_layer = PolygonLayer.from_geopandas(gdf=characteristic_df,\n", + " stroked=True,\n", + " get_fill_color=get_color,\n", + " get_line_color=[255, 255, 255],\n", + " get_line_width=5,\n", + " line_width_min_pixels=0.2,\n", + " line_width_units=\"meters\",\n", + " opacity=0.4,\n", + " auto_highlight = True\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "85c8c731-538e-440a-b784-125968222b7c", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "91c9a47b91814bd28c7b5c0a10557973", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "Map(custom_attribution='', layers=(BitmapTileLayer(data='http://mt0.google.com/vt/lyrs=s&hl=en&x={x}&y={y}&z={…" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m = Map([basemap, cop_layer])\n", + "\n", + "m" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e09e4973-9018-4065-b9f9-d4259019bcf5", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {