From 92ae582217f303a62d7eef916367a2be9ddbd28a Mon Sep 17 00:00:00 2001
From: "Oh, Sojung" <sojung.oh@studium.uni-hamburg.de>
Date: Fri, 5 Apr 2024 00:01:55 +0000
Subject: [PATCH] Upload New File
---
...arket_4models_RF_GBT_RFE_RF_RFE_GBT_.ipynb | 2463 +++++++++++++++++
1 file changed, 2463 insertions(+)
create mode 100644 Variable Selection/final_Variable_selection_10var_for_weekendmarket_4models_RF_GBT_RFE_RF_RFE_GBT_.ipynb
diff --git a/Variable Selection/final_Variable_selection_10var_for_weekendmarket_4models_RF_GBT_RFE_RF_RFE_GBT_.ipynb b/Variable Selection/final_Variable_selection_10var_for_weekendmarket_4models_RF_GBT_RFE_RF_RFE_GBT_.ipynb
new file mode 100644
index 0000000..a29a930
--- /dev/null
+++ b/Variable Selection/final_Variable_selection_10var_for_weekendmarket_4models_RF_GBT_RFE_RF_RFE_GBT_.ipynb
@@ -0,0 +1,2463 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd \n",
+ "import numpy as np \n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "# Let pandas render up to 68 columns before truncating wide DataFrames\n",
+ "pd.options.display.max_columns = 68\n",
+ "# pd.options.display.max_rows = 101  # optional: also raise the row limit"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Selected Variables from the prior discussion (including Stadtteilprofile and Tax data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>Anzahl der Einwohner: innen</th>\n",
+ " <th>Anzahl der Kinder und Jugendlichen unter 18 Jahren</th>\n",
+ " <th>Anteil Kinder und Jugendlicher unter 18 Jahren an der Gesamt-bevölkerung</th>\n",
+ " <th>Anzahl älterer Einwohner: innen über 64 Jahren</th>\n",
+ " <th>Anteil älterer Einwohner: innen über 64 Jahren an der Gesamt-bevölkerung</th>\n",
+ " <th>Anzahl der Haushalte</th>\n",
+ " <th>Durch-schnittliche Anzahl der Personen je Haushalt</th>\n",
+ " <th>Fläche in km²</th>\n",
+ " <th>Einwohner: innen je km²</th>\n",
+ " <th>Sozial-versicherungs-pflichtig Beschäftigte am Wohnort</th>\n",
+ " <th>Anteil sozial-versicherungs-pflichtig Beschäftigter am Wohnort an den Erwerbs-fähigen (15 bis unter 65-Jährige)</th>\n",
+ " <th>Anzahl der Arbeitslosen</th>\n",
+ " <th>Anzahl der Wohngebäude</th>\n",
+ " <th>Anzahl der Wohnungen</th>\n",
+ " <th>Durch-schnittliche Wohnungs-größe in m²</th>\n",
+ " <th>Durch-schnittliche Wohnfläche je Einwohner:in in m²</th>\n",
+ " <th>Anzahl der Sozial-wohnungen</th>\n",
+ " <th>Anteil der Sozial-wohnungen an allen Wohnungen</th>\n",
+ " <th>Durch-schnittlicher Immobilien-preis für ein Grundstück in EUR/m²</th>\n",
+ " <th>Durch-schnittlicher Immobilien-preis für eine Eigentums-wohnung in EUR/m²</th>\n",
+ " <th>Anzahl der Wohnungen in Ein- und Zweifamilien-häusern</th>\n",
+ " <th>Anteil der Wohnungen in Ein- und Zweifamilien-häusern an allen Wohnungen</th>\n",
+ " <th>Anzahl der Einpersonen-haushalte</th>\n",
+ " <th>Anteil der Haushalte, in denen nur eine Person lebt, an allen Haushalten</th>\n",
+ " <th>Gesamtbetrag der Einkünfte - [Steuerpflichtig]</th>\n",
+ " <th>Gesamtbetrag der Einkünfte - [1000€]</th>\n",
+ " <th>Festgesetzte Einkommenssteuer/ Jahreslohnsteuer - [1000€]</th>\n",
+ " <th>Gesamtbetrag Einkünfte Mittelwert - [€]</th>\n",
+ " <th>Gesamtbetrag Einkünfte Median - [€]</th>\n",
+ " <th>market_count</th>\n",
+ " <th>farms_count</th>\n",
+ " <th>greencrocers_count</th>\n",
+ " <th>supermarkets_count</th>\n",
+ " <th>biosupermarkets_count</th>\n",
+ " <th>all_restaurants_count</th>\n",
+ " <th>organic_restaurants_count</th>\n",
+ " <th>vegan_restaurants_count</th>\n",
+ " <th>art_score</th>\n",
+ " <th>distance_rathaus</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>stadtteil</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>Hamburg-Altstadt</th>\n",
+ " <td>3182.0</td>\n",
+ " <td>515.0</td>\n",
+ " <td>16.2</td>\n",
+ " <td>316.0</td>\n",
+ " <td>9.9</td>\n",
+ " <td>1884.0</td>\n",
+ " <td>1.7</td>\n",
+ " <td>1.3</td>\n",
+ " <td>2447.0</td>\n",
+ " <td>1346.0</td>\n",
+ " <td>55.6</td>\n",
+ " <td>324.0</td>\n",
+ " <td>103.0</td>\n",
+ " <td>1487.0</td>\n",
+ " <td>74.1</td>\n",
+ " <td>34.6</td>\n",
+ " <td>176.0</td>\n",
+ " <td>11.8</td>\n",
+ " <td>2366.0</td>\n",
+ " <td>4869.0</td>\n",
+ " <td>17.0</td>\n",
+ " <td>1.1</td>\n",
+ " <td>1057.0</td>\n",
+ " <td>56.1</td>\n",
+ " <td>1952.0</td>\n",
+ " <td>61168.0</td>\n",
+ " <td>11577.0</td>\n",
+ " <td>31336.0</td>\n",
+ " <td>10811.0</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>0</td>\n",
+ " <td>129</td>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " <td>2.565476</td>\n",
+ " <td>0.000000</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>HafenCity</th>\n",
+ " <td>6950.0</td>\n",
+ " <td>1386.0</td>\n",
+ " <td>19.9</td>\n",
+ " <td>644.0</td>\n",
+ " <td>9.3</td>\n",
+ " <td>3183.0</td>\n",
+ " <td>2.2</td>\n",
+ " <td>2.4</td>\n",
+ " <td>2865.0</td>\n",
+ " <td>3087.0</td>\n",
+ " <td>61.6</td>\n",
+ " <td>147.0</td>\n",
+ " <td>141.0</td>\n",
+ " <td>3898.0</td>\n",
+ " <td>81.4</td>\n",
+ " <td>45.7</td>\n",
+ " <td>1074.0</td>\n",
+ " <td>27.6</td>\n",
+ " <td>3031.0</td>\n",
+ " <td>10746.0</td>\n",
+ " <td>5.0</td>\n",
+ " <td>0.1</td>\n",
+ " <td>1126.0</td>\n",
+ " <td>35.4</td>\n",
+ " <td>1255.0</td>\n",
+ " <td>116973.0</td>\n",
+ " <td>34051.0</td>\n",
+ " <td>93206.0</td>\n",
+ " <td>57913.0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>6</td>\n",
+ " <td>1</td>\n",
+ " <td>56</td>\n",
+ " <td>3</td>\n",
+ " <td>1</td>\n",
+ " <td>1.952381</td>\n",
+ " <td>0.005758</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Neustadt</th>\n",
+ " <td>12649.0</td>\n",
+ " <td>1412.0</td>\n",
+ " <td>11.2</td>\n",
+ " <td>1919.0</td>\n",
+ " <td>15.2</td>\n",
+ " <td>8683.0</td>\n",
+ " <td>1.5</td>\n",
+ " <td>2.3</td>\n",
+ " <td>5592.0</td>\n",
+ " <td>6350.0</td>\n",
+ " <td>66.9</td>\n",
+ " <td>493.0</td>\n",
+ " <td>652.0</td>\n",
+ " <td>7700.0</td>\n",
+ " <td>63.1</td>\n",
+ " <td>38.4</td>\n",
+ " <td>992.0</td>\n",
+ " <td>12.9</td>\n",
+ " <td>2304.0</td>\n",
+ " <td>8240.0</td>\n",
+ " <td>70.0</td>\n",
+ " <td>0.9</td>\n",
+ " <td>5994.0</td>\n",
+ " <td>69.0</td>\n",
+ " <td>7015.0</td>\n",
+ " <td>242164.0</td>\n",
+ " <td>46861.0</td>\n",
+ " <td>34521.0</td>\n",
+ " <td>24715.0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>2</td>\n",
+ " <td>6</td>\n",
+ " <td>1</td>\n",
+ " <td>140</td>\n",
+ " <td>4</td>\n",
+ " <td>4</td>\n",
+ " <td>2.702381</td>\n",
+ " <td>0.001174</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>St. Pauli</th>\n",
+ " <td>22056.0</td>\n",
+ " <td>2941.0</td>\n",
+ " <td>13.3</td>\n",
+ " <td>2270.0</td>\n",
+ " <td>10.3</td>\n",
+ " <td>14772.0</td>\n",
+ " <td>1.5</td>\n",
+ " <td>2.2</td>\n",
+ " <td>9836.0</td>\n",
+ " <td>9903.0</td>\n",
+ " <td>57.2</td>\n",
+ " <td>1535.0</td>\n",
+ " <td>1293.0</td>\n",
+ " <td>12667.0</td>\n",
+ " <td>64.2</td>\n",
+ " <td>36.9</td>\n",
+ " <td>1522.0</td>\n",
+ " <td>12.0</td>\n",
+ " <td>1998.0</td>\n",
+ " <td>7716.0</td>\n",
+ " <td>173.0</td>\n",
+ " <td>1.4</td>\n",
+ " <td>10184.0</td>\n",
+ " <td>68.9</td>\n",
+ " <td>11066.0</td>\n",
+ " <td>309596.0</td>\n",
+ " <td>55589.0</td>\n",
+ " <td>27977.0</td>\n",
+ " <td>19399.0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>10</td>\n",
+ " <td>0</td>\n",
+ " <td>109</td>\n",
+ " <td>1</td>\n",
+ " <td>2</td>\n",
+ " <td>1.851190</td>\n",
+ " <td>0.009631</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>St. Georg</th>\n",
+ " <td>12318.0</td>\n",
+ " <td>1420.0</td>\n",
+ " <td>11.5</td>\n",
+ " <td>1676.0</td>\n",
+ " <td>13.6</td>\n",
+ " <td>7720.0</td>\n",
+ " <td>1.6</td>\n",
+ " <td>1.8</td>\n",
+ " <td>6758.0</td>\n",
+ " <td>5424.0</td>\n",
+ " <td>57.6</td>\n",
+ " <td>659.0</td>\n",
+ " <td>561.0</td>\n",
+ " <td>6444.0</td>\n",
+ " <td>71.1</td>\n",
+ " <td>37.2</td>\n",
+ " <td>764.0</td>\n",
+ " <td>11.9</td>\n",
+ " <td>1996.0</td>\n",
+ " <td>7961.0</td>\n",
+ " <td>63.0</td>\n",
+ " <td>1.0</td>\n",
+ " <td>5043.0</td>\n",
+ " <td>65.3</td>\n",
+ " <td>5683.0</td>\n",
+ " <td>250742.0</td>\n",
+ " <td>58371.0</td>\n",
+ " <td>44121.0</td>\n",
+ " <td>27161.0</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>11</td>\n",
+ " <td>0</td>\n",
+ " <td>80</td>\n",
+ " <td>2</td>\n",
+ " <td>4</td>\n",
+ " <td>2.523810</td>\n",
+ " <td>0.008493</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Hausbruch</th>\n",
+ " <td>16868.0</td>\n",
+ " <td>3196.0</td>\n",
+ " <td>18.9</td>\n",
+ " <td>3554.0</td>\n",
+ " <td>21.1</td>\n",
+ " <td>7829.0</td>\n",
+ " <td>2.1</td>\n",
+ " <td>9.8</td>\n",
+ " <td>1715.0</td>\n",
+ " <td>6492.0</td>\n",
+ " <td>60.8</td>\n",
+ " <td>758.0</td>\n",
+ " <td>3053.0</td>\n",
+ " <td>7323.0</td>\n",
+ " <td>85.0</td>\n",
+ " <td>36.9</td>\n",
+ " <td>1030.0</td>\n",
+ " <td>14.1</td>\n",
+ " <td>541.0</td>\n",
+ " <td>3877.0</td>\n",
+ " <td>2793.0</td>\n",
+ " <td>38.1</td>\n",
+ " <td>3134.0</td>\n",
+ " <td>40.0</td>\n",
+ " <td>7349.0</td>\n",
+ " <td>227990.0</td>\n",
+ " <td>36179.0</td>\n",
+ " <td>31023.0</td>\n",
+ " <td>21355.0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0.422619</td>\n",
+ " <td>0.104116</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Neugraben-Fischbek</th>\n",
+ " <td>33963.0</td>\n",
+ " <td>7480.0</td>\n",
+ " <td>22.0</td>\n",
+ " <td>6318.0</td>\n",
+ " <td>18.6</td>\n",
+ " <td>15602.0</td>\n",
+ " <td>2.2</td>\n",
+ " <td>22.5</td>\n",
+ " <td>1508.0</td>\n",
+ " <td>12746.0</td>\n",
+ " <td>59.9</td>\n",
+ " <td>1493.0</td>\n",
+ " <td>6669.0</td>\n",
+ " <td>14755.0</td>\n",
+ " <td>87.0</td>\n",
+ " <td>37.8</td>\n",
+ " <td>1078.0</td>\n",
+ " <td>7.3</td>\n",
+ " <td>554.0</td>\n",
+ " <td>3912.0</td>\n",
+ " <td>6124.0</td>\n",
+ " <td>41.5</td>\n",
+ " <td>6247.0</td>\n",
+ " <td>40.0</td>\n",
+ " <td>12290.0</td>\n",
+ " <td>382231.0</td>\n",
+ " <td>60244.0</td>\n",
+ " <td>31101.0</td>\n",
+ " <td>22492.0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " <td>1</td>\n",
+ " <td>8</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1.857143</td>\n",
+ " <td>0.130211</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Francop</th>\n",
+ " <td>736.0</td>\n",
+ " <td>119.0</td>\n",
+ " <td>16.2</td>\n",
+ " <td>133.0</td>\n",
+ " <td>18.1</td>\n",
+ " <td>374.0</td>\n",
+ " <td>2.0</td>\n",
+ " <td>8.8</td>\n",
+ " <td>84.0</td>\n",
+ " <td>NaN</td>\n",
+ " <td>NaN</td>\n",
+ " <td>NaN</td>\n",
+ " <td>208.0</td>\n",
+ " <td>347.0</td>\n",
+ " <td>98.5</td>\n",
+ " <td>46.4</td>\n",
+ " <td>0.0</td>\n",
+ " <td>0.0</td>\n",
+ " <td>427.0</td>\n",
+ " <td>NaN</td>\n",
+ " <td>255.0</td>\n",
+ " <td>73.5</td>\n",
+ " <td>161.0</td>\n",
+ " <td>43.0</td>\n",
+ " <td>356.0</td>\n",
+ " <td>12738.0</td>\n",
+ " <td>2083.0</td>\n",
+ " <td>35782.0</td>\n",
+ " <td>26568.0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0.000000</td>\n",
+ " <td>0.114626</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Neuenfelde</th>\n",
+ " <td>5245.0</td>\n",
+ " <td>1251.0</td>\n",
+ " <td>23.9</td>\n",
+ " <td>748.0</td>\n",
+ " <td>14.3</td>\n",
+ " <td>2337.0</td>\n",
+ " <td>2.2</td>\n",
+ " <td>15.7</td>\n",
+ " <td>335.0</td>\n",
+ " <td>1957.0</td>\n",
+ " <td>57.1</td>\n",
+ " <td>253.0</td>\n",
+ " <td>1103.0</td>\n",
+ " <td>2037.0</td>\n",
+ " <td>95.4</td>\n",
+ " <td>37.1</td>\n",
+ " <td>309.0</td>\n",
+ " <td>15.2</td>\n",
+ " <td>401.0</td>\n",
+ " <td>NaN</td>\n",
+ " <td>1261.0</td>\n",
+ " <td>61.9</td>\n",
+ " <td>1033.0</td>\n",
+ " <td>44.2</td>\n",
+ " <td>1909.0</td>\n",
+ " <td>62765.0</td>\n",
+ " <td>10422.0</td>\n",
+ " <td>32879.0</td>\n",
+ " <td>22909.0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0.375000</td>\n",
+ " <td>0.162791</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Cranz</th>\n",
+ " <td>810.0</td>\n",
+ " <td>130.0</td>\n",
+ " <td>16.0</td>\n",
+ " <td>154.0</td>\n",
+ " <td>19.0</td>\n",
+ " <td>460.0</td>\n",
+ " <td>1.8</td>\n",
+ " <td>1.3</td>\n",
+ " <td>608.0</td>\n",
+ " <td>NaN</td>\n",
+ " <td>NaN</td>\n",
+ " <td>NaN</td>\n",
+ " <td>170.0</td>\n",
+ " <td>405.0</td>\n",
+ " <td>87.5</td>\n",
+ " <td>43.7</td>\n",
+ " <td>0.0</td>\n",
+ " <td>0.0</td>\n",
+ " <td>451.0</td>\n",
+ " <td>NaN</td>\n",
+ " <td>146.0</td>\n",
+ " <td>36.0</td>\n",
+ " <td>252.0</td>\n",
+ " <td>54.8</td>\n",
+ " <td>378.0</td>\n",
+ " <td>11845.0</td>\n",
+ " <td>2018.0</td>\n",
+ " <td>31335.0</td>\n",
+ " <td>22852.0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0.000000</td>\n",
+ " <td>0.200686</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>99 rows × 39 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " Anzahl der Einwohner: innen \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 3182.0 \n",
+ "HafenCity 6950.0 \n",
+ "Neustadt 12649.0 \n",
+ "St. Pauli 22056.0 \n",
+ "St. Georg 12318.0 \n",
+ "... ... \n",
+ "Hausbruch 16868.0 \n",
+ "Neugraben-Fischbek 33963.0 \n",
+ "Francop 736.0 \n",
+ "Neuenfelde 5245.0 \n",
+ "Cranz 810.0 \n",
+ "\n",
+ " Anzahl der Kinder und Jugendlichen unter 18 Jahren \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 515.0 \n",
+ "HafenCity 1386.0 \n",
+ "Neustadt 1412.0 \n",
+ "St. Pauli 2941.0 \n",
+ "St. Georg 1420.0 \n",
+ "... ... \n",
+ "Hausbruch 3196.0 \n",
+ "Neugraben-Fischbek 7480.0 \n",
+ "Francop 119.0 \n",
+ "Neuenfelde 1251.0 \n",
+ "Cranz 130.0 \n",
+ "\n",
+ " Anteil Kinder und Jugendlicher unter 18 Jahren an der Gesamt-bevölkerung \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 16.2 \n",
+ "HafenCity 19.9 \n",
+ "Neustadt 11.2 \n",
+ "St. Pauli 13.3 \n",
+ "St. Georg 11.5 \n",
+ "... ... \n",
+ "Hausbruch 18.9 \n",
+ "Neugraben-Fischbek 22.0 \n",
+ "Francop 16.2 \n",
+ "Neuenfelde 23.9 \n",
+ "Cranz 16.0 \n",
+ "\n",
+ " Anzahl älterer Einwohner: innen über 64 Jahren \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 316.0 \n",
+ "HafenCity 644.0 \n",
+ "Neustadt 1919.0 \n",
+ "St. Pauli 2270.0 \n",
+ "St. Georg 1676.0 \n",
+ "... ... \n",
+ "Hausbruch 3554.0 \n",
+ "Neugraben-Fischbek 6318.0 \n",
+ "Francop 133.0 \n",
+ "Neuenfelde 748.0 \n",
+ "Cranz 154.0 \n",
+ "\n",
+ " Anteil älterer Einwohner: innen über 64 Jahren an der Gesamt-bevölkerung \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 9.9 \n",
+ "HafenCity 9.3 \n",
+ "Neustadt 15.2 \n",
+ "St. Pauli 10.3 \n",
+ "St. Georg 13.6 \n",
+ "... ... \n",
+ "Hausbruch 21.1 \n",
+ "Neugraben-Fischbek 18.6 \n",
+ "Francop 18.1 \n",
+ "Neuenfelde 14.3 \n",
+ "Cranz 19.0 \n",
+ "\n",
+ " Anzahl der Haushalte \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 1884.0 \n",
+ "HafenCity 3183.0 \n",
+ "Neustadt 8683.0 \n",
+ "St. Pauli 14772.0 \n",
+ "St. Georg 7720.0 \n",
+ "... ... \n",
+ "Hausbruch 7829.0 \n",
+ "Neugraben-Fischbek 15602.0 \n",
+ "Francop 374.0 \n",
+ "Neuenfelde 2337.0 \n",
+ "Cranz 460.0 \n",
+ "\n",
+ " Durch-schnittliche Anzahl der Personen je Haushalt \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 1.7 \n",
+ "HafenCity 2.2 \n",
+ "Neustadt 1.5 \n",
+ "St. Pauli 1.5 \n",
+ "St. Georg 1.6 \n",
+ "... ... \n",
+ "Hausbruch 2.1 \n",
+ "Neugraben-Fischbek 2.2 \n",
+ "Francop 2.0 \n",
+ "Neuenfelde 2.2 \n",
+ "Cranz 1.8 \n",
+ "\n",
+ " Fläche in km² Einwohner: innen je km² \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 1.3 2447.0 \n",
+ "HafenCity 2.4 2865.0 \n",
+ "Neustadt 2.3 5592.0 \n",
+ "St. Pauli 2.2 9836.0 \n",
+ "St. Georg 1.8 6758.0 \n",
+ "... ... ... \n",
+ "Hausbruch 9.8 1715.0 \n",
+ "Neugraben-Fischbek 22.5 1508.0 \n",
+ "Francop 8.8 84.0 \n",
+ "Neuenfelde 15.7 335.0 \n",
+ "Cranz 1.3 608.0 \n",
+ "\n",
+ " Sozial-versicherungs-pflichtig Beschäftigte am Wohnort \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 1346.0 \n",
+ "HafenCity 3087.0 \n",
+ "Neustadt 6350.0 \n",
+ "St. Pauli 9903.0 \n",
+ "St. Georg 5424.0 \n",
+ "... ... \n",
+ "Hausbruch 6492.0 \n",
+ "Neugraben-Fischbek 12746.0 \n",
+ "Francop NaN \n",
+ "Neuenfelde 1957.0 \n",
+ "Cranz NaN \n",
+ "\n",
+ " Anteil sozial-versicherungs-pflichtig Beschäftigter am Wohnort an den Erwerbs-fähigen (15 bis unter 65-Jährige) \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 55.6 \n",
+ "HafenCity 61.6 \n",
+ "Neustadt 66.9 \n",
+ "St. Pauli 57.2 \n",
+ "St. Georg 57.6 \n",
+ "... ... \n",
+ "Hausbruch 60.8 \n",
+ "Neugraben-Fischbek 59.9 \n",
+ "Francop NaN \n",
+ "Neuenfelde 57.1 \n",
+ "Cranz NaN \n",
+ "\n",
+ " Anzahl der Arbeitslosen Anzahl der Wohngebäude \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 324.0 103.0 \n",
+ "HafenCity 147.0 141.0 \n",
+ "Neustadt 493.0 652.0 \n",
+ "St. Pauli 1535.0 1293.0 \n",
+ "St. Georg 659.0 561.0 \n",
+ "... ... ... \n",
+ "Hausbruch 758.0 3053.0 \n",
+ "Neugraben-Fischbek 1493.0 6669.0 \n",
+ "Francop NaN 208.0 \n",
+ "Neuenfelde 253.0 1103.0 \n",
+ "Cranz NaN 170.0 \n",
+ "\n",
+ " Anzahl der Wohnungen \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 1487.0 \n",
+ "HafenCity 3898.0 \n",
+ "Neustadt 7700.0 \n",
+ "St. Pauli 12667.0 \n",
+ "St. Georg 6444.0 \n",
+ "... ... \n",
+ "Hausbruch 7323.0 \n",
+ "Neugraben-Fischbek 14755.0 \n",
+ "Francop 347.0 \n",
+ "Neuenfelde 2037.0 \n",
+ "Cranz 405.0 \n",
+ "\n",
+ " Durch-schnittliche Wohnungs-größe in m² \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 74.1 \n",
+ "HafenCity 81.4 \n",
+ "Neustadt 63.1 \n",
+ "St. Pauli 64.2 \n",
+ "St. Georg 71.1 \n",
+ "... ... \n",
+ "Hausbruch 85.0 \n",
+ "Neugraben-Fischbek 87.0 \n",
+ "Francop 98.5 \n",
+ "Neuenfelde 95.4 \n",
+ "Cranz 87.5 \n",
+ "\n",
+ " Durch-schnittliche Wohnfläche je Einwohner:in in m² \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 34.6 \n",
+ "HafenCity 45.7 \n",
+ "Neustadt 38.4 \n",
+ "St. Pauli 36.9 \n",
+ "St. Georg 37.2 \n",
+ "... ... \n",
+ "Hausbruch 36.9 \n",
+ "Neugraben-Fischbek 37.8 \n",
+ "Francop 46.4 \n",
+ "Neuenfelde 37.1 \n",
+ "Cranz 43.7 \n",
+ "\n",
+ " Anzahl der Sozial-wohnungen \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 176.0 \n",
+ "HafenCity 1074.0 \n",
+ "Neustadt 992.0 \n",
+ "St. Pauli 1522.0 \n",
+ "St. Georg 764.0 \n",
+ "... ... \n",
+ "Hausbruch 1030.0 \n",
+ "Neugraben-Fischbek 1078.0 \n",
+ "Francop 0.0 \n",
+ "Neuenfelde 309.0 \n",
+ "Cranz 0.0 \n",
+ "\n",
+ " Anteil der Sozial-wohnungen an allen Wohnungen \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 11.8 \n",
+ "HafenCity 27.6 \n",
+ "Neustadt 12.9 \n",
+ "St. Pauli 12.0 \n",
+ "St. Georg 11.9 \n",
+ "... ... \n",
+ "Hausbruch 14.1 \n",
+ "Neugraben-Fischbek 7.3 \n",
+ "Francop 0.0 \n",
+ "Neuenfelde 15.2 \n",
+ "Cranz 0.0 \n",
+ "\n",
+ " Durch-schnittlicher Immobilien-preis für ein Grundstück in EUR/m² \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 2366.0 \n",
+ "HafenCity 3031.0 \n",
+ "Neustadt 2304.0 \n",
+ "St. Pauli 1998.0 \n",
+ "St. Georg 1996.0 \n",
+ "... ... \n",
+ "Hausbruch 541.0 \n",
+ "Neugraben-Fischbek 554.0 \n",
+ "Francop 427.0 \n",
+ "Neuenfelde 401.0 \n",
+ "Cranz 451.0 \n",
+ "\n",
+ " Durch-schnittlicher Immobilien-preis für eine Eigentums-wohnung in EUR/m² \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 4869.0 \n",
+ "HafenCity 10746.0 \n",
+ "Neustadt 8240.0 \n",
+ "St. Pauli 7716.0 \n",
+ "St. Georg 7961.0 \n",
+ "... ... \n",
+ "Hausbruch 3877.0 \n",
+ "Neugraben-Fischbek 3912.0 \n",
+ "Francop NaN \n",
+ "Neuenfelde NaN \n",
+ "Cranz NaN \n",
+ "\n",
+ " Anzahl der Wohnungen in Ein- und Zweifamilien-häusern \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 17.0 \n",
+ "HafenCity 5.0 \n",
+ "Neustadt 70.0 \n",
+ "St. Pauli 173.0 \n",
+ "St. Georg 63.0 \n",
+ "... ... \n",
+ "Hausbruch 2793.0 \n",
+ "Neugraben-Fischbek 6124.0 \n",
+ "Francop 255.0 \n",
+ "Neuenfelde 1261.0 \n",
+ "Cranz 146.0 \n",
+ "\n",
+ " Anteil der Wohnungen in Ein- und Zweifamilien-häusern an allen Wohnungen \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 1.1 \n",
+ "HafenCity 0.1 \n",
+ "Neustadt 0.9 \n",
+ "St. Pauli 1.4 \n",
+ "St. Georg 1.0 \n",
+ "... ... \n",
+ "Hausbruch 38.1 \n",
+ "Neugraben-Fischbek 41.5 \n",
+ "Francop 73.5 \n",
+ "Neuenfelde 61.9 \n",
+ "Cranz 36.0 \n",
+ "\n",
+ " Anzahl der Einpersonen-haushalte \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 1057.0 \n",
+ "HafenCity 1126.0 \n",
+ "Neustadt 5994.0 \n",
+ "St. Pauli 10184.0 \n",
+ "St. Georg 5043.0 \n",
+ "... ... \n",
+ "Hausbruch 3134.0 \n",
+ "Neugraben-Fischbek 6247.0 \n",
+ "Francop 161.0 \n",
+ "Neuenfelde 1033.0 \n",
+ "Cranz 252.0 \n",
+ "\n",
+ " Anteil der Haushalte, in denen nur eine Person lebt, an allen Haushalten \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 56.1 \n",
+ "HafenCity 35.4 \n",
+ "Neustadt 69.0 \n",
+ "St. Pauli 68.9 \n",
+ "St. Georg 65.3 \n",
+ "... ... \n",
+ "Hausbruch 40.0 \n",
+ "Neugraben-Fischbek 40.0 \n",
+ "Francop 43.0 \n",
+ "Neuenfelde 44.2 \n",
+ "Cranz 54.8 \n",
+ "\n",
+ " Gesamtbetrag der Einkünfte - [Steuerpflichtig] \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 1952.0 \n",
+ "HafenCity 1255.0 \n",
+ "Neustadt 7015.0 \n",
+ "St. Pauli 11066.0 \n",
+ "St. Georg 5683.0 \n",
+ "... ... \n",
+ "Hausbruch 7349.0 \n",
+ "Neugraben-Fischbek 12290.0 \n",
+ "Francop 356.0 \n",
+ "Neuenfelde 1909.0 \n",
+ "Cranz 378.0 \n",
+ "\n",
+ " Gesamtbetrag der Einkünfte - [1000€] \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 61168.0 \n",
+ "HafenCity 116973.0 \n",
+ "Neustadt 242164.0 \n",
+ "St. Pauli 309596.0 \n",
+ "St. Georg 250742.0 \n",
+ "... ... \n",
+ "Hausbruch 227990.0 \n",
+ "Neugraben-Fischbek 382231.0 \n",
+ "Francop 12738.0 \n",
+ "Neuenfelde 62765.0 \n",
+ "Cranz 11845.0 \n",
+ "\n",
+ " Festgesetzte Einkommenssteuer/ Jahreslohnsteuer - [1000€] \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 11577.0 \n",
+ "HafenCity 34051.0 \n",
+ "Neustadt 46861.0 \n",
+ "St. Pauli 55589.0 \n",
+ "St. Georg 58371.0 \n",
+ "... ... \n",
+ "Hausbruch 36179.0 \n",
+ "Neugraben-Fischbek 60244.0 \n",
+ "Francop 2083.0 \n",
+ "Neuenfelde 10422.0 \n",
+ "Cranz 2018.0 \n",
+ "\n",
+ " Gesamtbetrag Einkünfte Mittelwert - [€] \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 31336.0 \n",
+ "HafenCity 93206.0 \n",
+ "Neustadt 34521.0 \n",
+ "St. Pauli 27977.0 \n",
+ "St. Georg 44121.0 \n",
+ "... ... \n",
+ "Hausbruch 31023.0 \n",
+ "Neugraben-Fischbek 31101.0 \n",
+ "Francop 35782.0 \n",
+ "Neuenfelde 32879.0 \n",
+ "Cranz 31335.0 \n",
+ "\n",
+ " Gesamtbetrag Einkünfte Median - [€] market_count \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 10811.0 2 \n",
+ "HafenCity 57913.0 1 \n",
+ "Neustadt 24715.0 1 \n",
+ "St. Pauli 19399.0 1 \n",
+ "St. Georg 27161.0 2 \n",
+ "... ... ... \n",
+ "Hausbruch 21355.0 0 \n",
+ "Neugraben-Fischbek 22492.0 1 \n",
+ "Francop 26568.0 0 \n",
+ "Neuenfelde 22909.0 0 \n",
+ "Cranz 22852.0 0 \n",
+ "\n",
+ " farms_count greencrocers_count supermarkets_count \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 0 1 4 \n",
+ "HafenCity 0 0 6 \n",
+ "Neustadt 0 2 6 \n",
+ "St. Pauli 0 1 10 \n",
+ "St. Georg 0 0 11 \n",
+ "... ... ... ... \n",
+ "Hausbruch 0 1 1 \n",
+ "Neugraben-Fischbek 0 0 4 \n",
+ "Francop 0 0 0 \n",
+ "Neuenfelde 0 1 0 \n",
+ "Cranz 0 0 0 \n",
+ "\n",
+ " biosupermarkets_count all_restaurants_count \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 0 129 \n",
+ "HafenCity 1 56 \n",
+ "Neustadt 1 140 \n",
+ "St. Pauli 0 109 \n",
+ "St. Georg 0 80 \n",
+ "... ... ... \n",
+ "Hausbruch 0 1 \n",
+ "Neugraben-Fischbek 1 8 \n",
+ "Francop 0 0 \n",
+ "Neuenfelde 0 2 \n",
+ "Cranz 0 2 \n",
+ "\n",
+ " organic_restaurants_count vegan_restaurants_count \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 0 4 \n",
+ "HafenCity 3 1 \n",
+ "Neustadt 4 4 \n",
+ "St. Pauli 1 2 \n",
+ "St. Georg 2 4 \n",
+ "... ... ... \n",
+ "Hausbruch 0 0 \n",
+ "Neugraben-Fischbek 0 0 \n",
+ "Francop 0 0 \n",
+ "Neuenfelde 0 0 \n",
+ "Cranz 0 0 \n",
+ "\n",
+ " art_score distance_rathaus \n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 2.565476 0.000000 \n",
+ "HafenCity 1.952381 0.005758 \n",
+ "Neustadt 2.702381 0.001174 \n",
+ "St. Pauli 1.851190 0.009631 \n",
+ "St. Georg 2.523810 0.008493 \n",
+ "... ... ... \n",
+ "Hausbruch 0.422619 0.104116 \n",
+ "Neugraben-Fischbek 1.857143 0.130211 \n",
+ "Francop 0.000000 0.114626 \n",
+ "Neuenfelde 0.375000 0.162791 \n",
+ "Cranz 0.000000 0.200686 \n",
+ "\n",
+ "[99 rows x 39 columns]"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#---------------- Load the pre-merged dataset from final_data.csv ------------------\n",
+ "\n",
+ "# NOTE(review): relative path; assumes the kernel's working directory contains the Data/ folder - confirm.\n",
+ "path1 = \"Data/final_data.csv\"\n",
+ "\n",
+ "# Use the first CSV column as the row index and name it 'stadtteil'.\n",
+ "# This replaces the in-place edit of final_df.columns.values and keeps the cell safe to re-run.\n",
+ "final_df = pd.read_csv(path1, index_col=0).rename_axis(\"stadtteil\")\n",
+ "final_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Data pre-processing (replacing NaNs with column averages)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "total number of missing values in no_art_merged_df: 0\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "#--------------------------- pre-processing for data: independent variables and dependent variable ('weekend markets')----------------#\n",
+ "\n",
+ "# Replace NaNs with the mean of their column. numeric_only=True keeps the mean\n",
+ "# computable on pandas >= 2.0 even if final_df ever holds a non-numeric column.\n",
+ "column_means = final_df.mean(numeric_only=True)\n",
+ "\n",
+ "merged_df = final_df.fillna(column_means)\n",
+ "\n",
+ "\n",
+ "# Count the NaNs left in merged_df (the printed label says no_art_merged_df, but the frame checked is merged_df)\n",
+ "\n",
+ "total_missing_values = merged_df.isna().sum().sum()\n",
+ "\n",
+ "print(\"total number of missing values in no_art_merged_df:\", total_missing_values)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Select the 11 columns used for predicting weekend markets: 10 predictor variables plus `market_count`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>greencrocers_count</th>\n",
+ " <th>supermarkets_count</th>\n",
+ " <th>biosupermarkets_count</th>\n",
+ " <th>all_restaurants_count</th>\n",
+ " <th>organic_restaurants_count</th>\n",
+ " <th>Einwohner: innen je km²</th>\n",
+ " <th>distance_rathaus</th>\n",
+ " <th>Gesamtbetrag Einkünfte Median - [€]</th>\n",
+ " <th>Anteil der Sozial-wohnungen an allen Wohnungen</th>\n",
+ " <th>vegan_restaurants_count</th>\n",
+ " <th>market_count</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>stadtteil</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>Hamburg-Altstadt</th>\n",
+ " <td>1</td>\n",
+ " <td>4</td>\n",
+ " <td>0</td>\n",
+ " <td>129</td>\n",
+ " <td>0</td>\n",
+ " <td>2447.0</td>\n",
+ " <td>0.000000</td>\n",
+ " <td>10811.0</td>\n",
+ " <td>11.8</td>\n",
+ " <td>4</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>HafenCity</th>\n",
+ " <td>0</td>\n",
+ " <td>6</td>\n",
+ " <td>1</td>\n",
+ " <td>56</td>\n",
+ " <td>3</td>\n",
+ " <td>2865.0</td>\n",
+ " <td>0.005758</td>\n",
+ " <td>57913.0</td>\n",
+ " <td>27.6</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Neustadt</th>\n",
+ " <td>2</td>\n",
+ " <td>6</td>\n",
+ " <td>1</td>\n",
+ " <td>140</td>\n",
+ " <td>4</td>\n",
+ " <td>5592.0</td>\n",
+ " <td>0.001174</td>\n",
+ " <td>24715.0</td>\n",
+ " <td>12.9</td>\n",
+ " <td>4</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>St. Pauli</th>\n",
+ " <td>1</td>\n",
+ " <td>10</td>\n",
+ " <td>0</td>\n",
+ " <td>109</td>\n",
+ " <td>1</td>\n",
+ " <td>9836.0</td>\n",
+ " <td>0.009631</td>\n",
+ " <td>19399.0</td>\n",
+ " <td>12.0</td>\n",
+ " <td>2</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>St. Georg</th>\n",
+ " <td>0</td>\n",
+ " <td>11</td>\n",
+ " <td>0</td>\n",
+ " <td>80</td>\n",
+ " <td>2</td>\n",
+ " <td>6758.0</td>\n",
+ " <td>0.008493</td>\n",
+ " <td>27161.0</td>\n",
+ " <td>11.9</td>\n",
+ " <td>4</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Hausbruch</th>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>1715.0</td>\n",
+ " <td>0.104116</td>\n",
+ " <td>21355.0</td>\n",
+ " <td>14.1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Neugraben-Fischbek</th>\n",
+ " <td>0</td>\n",
+ " <td>4</td>\n",
+ " <td>1</td>\n",
+ " <td>8</td>\n",
+ " <td>0</td>\n",
+ " <td>1508.0</td>\n",
+ " <td>0.130211</td>\n",
+ " <td>22492.0</td>\n",
+ " <td>7.3</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Francop</th>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>84.0</td>\n",
+ " <td>0.114626</td>\n",
+ " <td>26568.0</td>\n",
+ " <td>0.0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Neuenfelde</th>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>335.0</td>\n",
+ " <td>0.162791</td>\n",
+ " <td>22909.0</td>\n",
+ " <td>15.2</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>Cranz</th>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>2</td>\n",
+ " <td>0</td>\n",
+ " <td>608.0</td>\n",
+ " <td>0.200686</td>\n",
+ " <td>22852.0</td>\n",
+ " <td>0.0</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>99 rows × 11 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " greencrocers_count supermarkets_count \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 1 4 \n",
+ "HafenCity 0 6 \n",
+ "Neustadt 2 6 \n",
+ "St. Pauli 1 10 \n",
+ "St. Georg 0 11 \n",
+ "... ... ... \n",
+ "Hausbruch 1 1 \n",
+ "Neugraben-Fischbek 0 4 \n",
+ "Francop 0 0 \n",
+ "Neuenfelde 1 0 \n",
+ "Cranz 0 0 \n",
+ "\n",
+ " biosupermarkets_count all_restaurants_count \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 0 129 \n",
+ "HafenCity 1 56 \n",
+ "Neustadt 1 140 \n",
+ "St. Pauli 0 109 \n",
+ "St. Georg 0 80 \n",
+ "... ... ... \n",
+ "Hausbruch 0 1 \n",
+ "Neugraben-Fischbek 1 8 \n",
+ "Francop 0 0 \n",
+ "Neuenfelde 0 2 \n",
+ "Cranz 0 2 \n",
+ "\n",
+ " organic_restaurants_count Einwohner: innen je km² \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 0 2447.0 \n",
+ "HafenCity 3 2865.0 \n",
+ "Neustadt 4 5592.0 \n",
+ "St. Pauli 1 9836.0 \n",
+ "St. Georg 2 6758.0 \n",
+ "... ... ... \n",
+ "Hausbruch 0 1715.0 \n",
+ "Neugraben-Fischbek 0 1508.0 \n",
+ "Francop 0 84.0 \n",
+ "Neuenfelde 0 335.0 \n",
+ "Cranz 0 608.0 \n",
+ "\n",
+ " distance_rathaus Gesamtbetrag Einkünfte Median - [€] \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 0.000000 10811.0 \n",
+ "HafenCity 0.005758 57913.0 \n",
+ "Neustadt 0.001174 24715.0 \n",
+ "St. Pauli 0.009631 19399.0 \n",
+ "St. Georg 0.008493 27161.0 \n",
+ "... ... ... \n",
+ "Hausbruch 0.104116 21355.0 \n",
+ "Neugraben-Fischbek 0.130211 22492.0 \n",
+ "Francop 0.114626 26568.0 \n",
+ "Neuenfelde 0.162791 22909.0 \n",
+ "Cranz 0.200686 22852.0 \n",
+ "\n",
+ " Anteil der Sozial-wohnungen an allen Wohnungen \\\n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 11.8 \n",
+ "HafenCity 27.6 \n",
+ "Neustadt 12.9 \n",
+ "St. Pauli 12.0 \n",
+ "St. Georg 11.9 \n",
+ "... ... \n",
+ "Hausbruch 14.1 \n",
+ "Neugraben-Fischbek 7.3 \n",
+ "Francop 0.0 \n",
+ "Neuenfelde 15.2 \n",
+ "Cranz 0.0 \n",
+ "\n",
+ " vegan_restaurants_count market_count \n",
+ "stadtteil \n",
+ "Hamburg-Altstadt 4 2 \n",
+ "HafenCity 1 1 \n",
+ "Neustadt 4 1 \n",
+ "St. Pauli 2 1 \n",
+ "St. Georg 4 2 \n",
+ "... ... ... \n",
+ "Hausbruch 0 0 \n",
+ "Neugraben-Fischbek 0 1 \n",
+ "Francop 0 0 \n",
+ "Neuenfelde 0 0 \n",
+ "Cranz 0 0 \n",
+ "\n",
+ "[99 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## -------------------------- 10 variables to use for predicting weekend markets, from the discussion ------------------------ ##\n",
+ "\n",
+ "# Name of variables for weekendmarkets (independent, dependent)\n",
+ "var_markets = ['greencrocers_count', 'supermarkets_count', 'biosupermarkets_count', 'all_restaurants_count', 'organic_restaurants_count',\n",
+ " 'Einwohner: innen je km²', 'distance_rathaus', 'Gesamtbetrag Einkünfte Median - [€]', 'Anteil der Sozial-wohnungen an allen Wohnungen',\n",
+ " 'vegan_restaurants_count', 'market_count']\n",
+ "\n",
+ "# Restrict merged_df to these columns\n",
+ "merged_df = merged_df[var_markets]\n",
+ "#sprofile_df.shape\n",
+ "merged_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Set the number of repeated random train/test splits & define the dependent variable (weekend markets)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "num_repeats = 100\n",
+ "\n",
+ "X = merged_df.drop(columns=[\"market_count\"]) # independent variables\n",
+ "y = merged_df[\"market_count\"] # dependent variables"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Random Forest Method (Variable Selection 1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import accuracy_score\n",
+ "#------------------------Containers for recording each iteration's results-----------------------\n",
+ "\n",
+ "rf_accuracies_list = []\n",
+ "rf_iteration_importances_df = pd.DataFrame(columns=X.columns)\n",
+ "\n",
+ "\n",
+ "for i in range(num_repeats):\n",
+ " #--------------------------Splitting the data into training and testing sets (80:20 ratio)-----------------------------------\n",
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)\n",
+ "\n",
+ "\n",
+ " # Making Random Forest Model and Training\n",
+ " rf_model = RandomForestClassifier(n_estimators=100, random_state=i) # use 100 decision trees\n",
+ " rf_model.fit(X_train, y_train)\n",
+ "\n",
+ " # check the importance of variables\n",
+ " rf_importances = rf_model.feature_importances_\n",
+ "\n",
+ "\n",
+ " #------------------------------------------------------------------------------------\n",
+ " \n",
+ " # record the importance of variables\n",
+ " rf_iteration_importances_df.loc[i] = rf_importances\n",
+ "\n",
+ " #------------------------------Accuracy for RF modeling--------------------------------------------\n",
+ "\n",
+ " # Checking accuracy on testing data\n",
+ " rf_y_pred = rf_model.predict(X_test)\n",
+ " rf_accuracy = accuracy_score(y_test, rf_y_pred)\n",
+ " #print(\"Random Forest Accuracy (using all features) on testing data:\", rf_accuracy)\n",
+ " rf_accuracies_list.append(rf_accuracy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RF model features and avg_importances:\n",
+ "all_restaurants_count 0.219926\n",
+ "supermarkets_count 0.186518\n",
+ "Anteil der Sozial-wohnungen an allen Wohnungen 0.119212\n",
+ "Einwohner: innen je km² 0.118131\n",
+ "Gesamtbetrag Einkünfte Median - [€] 0.114430\n",
+ "distance_rathaus 0.108481\n",
+ "greencrocers_count 0.065882\n",
+ "biosupermarkets_count 0.033396\n",
+ "vegan_restaurants_count 0.024088\n",
+ "organic_restaurants_count 0.009936\n",
+ "dtype: float64\n",
+ "Random Forest avg_accuracy: 0.6145\n"
+ ]
+ }
+ ],
+ "source": [
+ "#---------------------average importances and accuracies----------------------#\n",
+ "#rf_accuracies_list\n",
+ "#rf_iteration_importances_df\n",
+ "\n",
+ "# calculate the average importance of features\n",
+ "rf_feature_mean_importances = rf_iteration_importances_df.mean(axis=0)\n",
+ "rf_feature_mean_importances\n",
+ "\n",
+ "rf_feature_mean_importances_sorted = rf_feature_mean_importances.sort_values(ascending=False)\n",
+ "print('RF model features and avg_importances:')\n",
+ "print(rf_feature_mean_importances_sorted)\n",
+ "\n",
+ "# calculate the average accuracies of rf model\n",
+ "\n",
+ "rf_avg_accuracy = np.mean(rf_accuracies_list)\n",
+ "print(\"Random Forest avg_accuracy:\", rf_avg_accuracy )\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Gradient Boosting Tree (Variable Selection 2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.ensemble import GradientBoostingClassifier\n",
+ "from sklearn.feature_selection import SelectFromModel\n",
+ "\n",
+ "#------------------------Containers for recording each iteration's results-----------------------\n",
+ "\n",
+ "gbt_accuracies_list = []\n",
+ "gbt_iteration_importances_df = pd.DataFrame(columns=X.columns)\n",
+ "\n",
+ "for i in range(num_repeats):\n",
+ " #--------------------------Splitting the data into training and testing sets (80:20 ratio)-----------------------------------\n",
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)\n",
+ "\n",
+ " # create Gradient Boosting Tree model\n",
+ " gbt_model = GradientBoostingClassifier(n_estimators=100, random_state=i) # using 100 trees, seeded with the iteration index i\n",
+ " gbt_model.fit(X_train, y_train)\n",
+ "\n",
+ " # all variables and their importances\n",
+ " gbt_importances = gbt_model.feature_importances_\n",
+ "\n",
+ " # record the importance of variables\n",
+ " gbt_iteration_importances_df.loc[i] = gbt_importances\n",
+ "\n",
+ " #---------------------------------Accuracy for GBT modeling------------(using the same train/test split as the RF run)\n",
+ "\n",
+ " # Checking accuracy on testing data for GBT with all features\n",
+ " gbt_y_pred = gbt_model.predict(X_test)\n",
+ " gbt_accuracy = accuracy_score(y_test, gbt_y_pred)\n",
+ " #print(\"Gradient Boosting tree Accuracy (using all features) on testing data:\", gbt_accuracy)\n",
+ " gbt_accuracies_list.append(gbt_accuracy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "GBT model features and avg_importances:\n",
+ "all_restaurants_count 0.291120\n",
+ "supermarkets_count 0.221202\n",
+ "Gesamtbetrag Einkünfte Median - [€] 0.122629\n",
+ "Anteil der Sozial-wohnungen an allen Wohnungen 0.111979\n",
+ "distance_rathaus 0.104781\n",
+ "Einwohner: innen je km² 0.070611\n",
+ "greencrocers_count 0.043105\n",
+ "biosupermarkets_count 0.019531\n",
+ "vegan_restaurants_count 0.007623\n",
+ "organic_restaurants_count 0.007419\n",
+ "dtype: float64\n",
+ "GBT avg_accuracy: 0.6195\n"
+ ]
+ }
+ ],
+ "source": [
+ "#---------------------average importances and accuracies----------------------#\n",
+ "#gbt_accuracies_list\n",
+ "#gbt_iteration_importances_df\n",
+ "\n",
+ "# calculate the average importance of features\n",
+ "gbt_feature_mean_importances = gbt_iteration_importances_df.mean(axis=0)\n",
+ "gbt_feature_mean_importances\n",
+ "\n",
+ "#sort the features from highest importance to low\n",
+ "gbt_feature_mean_importances_sorted = gbt_feature_mean_importances.sort_values(ascending=False)\n",
+ "print('GBT model features and avg_importances:')\n",
+ "print(gbt_feature_mean_importances_sorted)\n",
+ "\n",
+ "# calculate the average accuracies of gbt model\n",
+ "\n",
+ "gbt_avg_accuracy = np.mean(gbt_accuracies_list)\n",
+ "print(\"GBT avg_accuracy:\", gbt_avg_accuracy )\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Recursive Feature Elimination with Random Forest (Variable Selection 3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.feature_selection import RFE\n",
+ "\n",
+ "#------------------------Containers for recording each iteration's results-----------------------\n",
+ "\n",
+ "rfe1_accuracies_list = []\n",
+ "rfe1_iteration_importances_df = pd.DataFrame(columns=X.columns)\n",
+ "\n",
+ "for i in range(num_repeats):\n",
+ " #--------------------------Splitting the data into training and testing sets (80:20 ratio)-----------------------------------\n",
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)\n",
+ "\n",
+ " #--------------------------RF modeling with RFE----------------------\n",
+ "\n",
+ " # create Random Forest model2\n",
+ " rf2_model = RandomForestClassifier(n_estimators=100, random_state=i) # using 100 trees, seeded with the iteration index i\n",
+ "\n",
+ " # variable selection with Recursive Feature Elimination (RFE) & train model; NOTE: X has 10 predictors and n_features_to_select=10, so RFE eliminates nothing here\n",
+ " rfe1 = RFE(estimator=rf2_model, n_features_to_select=10, step=1)\n",
+ " rfe1.fit(X_train, y_train)\n",
+ "\n",
+ " # indices of RFE_RF_selected variables\n",
+ " rfe1_selected_indices = rfe1.support_\n",
+ "\n",
+ " # retrain the RandomForest model with RFE_RF_selected variables\n",
+ "\n",
+ " rfe1_X_selected_train = X_train.iloc[:, rfe1_selected_indices] # only RFE_RF_selected variables\n",
+ " rfe1_X_selected_test = X_test.iloc[:, rfe1_selected_indices]\n",
+ " rf2_model.fit(rfe1_X_selected_train, y_train) # retrain the model\n",
+ "\n",
+ " # names of the RFE_RF_selected variables and their importances after retraining (nothing is printed here)\n",
+ " rfe1_selected_features = X_train.columns[rfe1_selected_indices]\n",
+ " rfe1_selected_importances = rf2_model.feature_importances_\n",
+ "\n",
+ " # record the importance of variables\n",
+ " rfe1_iteration_importances_df.loc[i] = rfe1_selected_importances\n",
+ "\n",
+ " # check the accuracy of RFE_RF_model\n",
+ "\n",
+ " rfe1_y_pred = rf2_model.predict(rfe1_X_selected_test)\n",
+ " rfe1_accuracy = accuracy_score(y_test, rfe1_y_pred)\n",
+ " #print(\"\\nRFE_RF_Accuracy:\", rfe1_accuracy)\n",
+ " rfe1_accuracies_list.append(rfe1_accuracy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RFE_RF model features and avg_importances:\n",
+ "all_restaurants_count 0.219926\n",
+ "supermarkets_count 0.186518\n",
+ "Anteil der Sozial-wohnungen an allen Wohnungen 0.119212\n",
+ "Einwohner: innen je km² 0.118131\n",
+ "Gesamtbetrag Einkünfte Median - [€] 0.114430\n",
+ "distance_rathaus 0.108481\n",
+ "greencrocers_count 0.065882\n",
+ "biosupermarkets_count 0.033396\n",
+ "vegan_restaurants_count 0.024088\n",
+ "organic_restaurants_count 0.009936\n",
+ "dtype: float64\n",
+ "RFE_RF avg_accuracy: 0.6145\n"
+ ]
+ }
+ ],
+ "source": [
+ "#---------------------average importances and accuracies----------------------#\n",
+ "#rfe1_accuracies_list\n",
+ "#rfe1_iteration_importances_df\n",
+ "\n",
+ "# calculate the average importance of features\n",
+ "rfe1_feature_mean_importances = rfe1_iteration_importances_df.mean(axis=0)\n",
+ "rfe1_feature_mean_importances\n",
+ "\n",
+ "#sort the features from highest importance to low\n",
+ "rfe1_feature_mean_importances_sorted = rfe1_feature_mean_importances.sort_values(ascending=False)\n",
+ "print('RFE_RF model features and avg_importances:')\n",
+ "print(rfe1_feature_mean_importances_sorted)\n",
+ "\n",
+ "# calculate the average accuracies of rfe1 model\n",
+ "\n",
+ "rfe1_avg_accuracy = np.mean(rfe1_accuracies_list)\n",
+ "print(\"RFE_RF avg_accuracy:\", rfe1_avg_accuracy )\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Recursive Feature Elimination with Gradient Boosting Tree (Variable Selection 4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#------------------------Containers for recording each iteration's results-----------------------\n",
+ "\n",
+ "rfe2_accuracies_list = []\n",
+ "rfe2_iteration_importances_df = pd.DataFrame(columns=X.columns)\n",
+ "\n",
+ "for i in range(num_repeats):\n",
+ " #--------------------------Splitting the data into training and testing sets (80:20 ratio)-----------------------------------\n",
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)\n",
+ "\n",
+ " #--------------------------GBT modeling with RFE----------------------\n",
+ "\n",
+ " # Create Gradient Boosting Tree model\n",
+ " gbt2_model = GradientBoostingClassifier(n_estimators=100, random_state=i) # using 100 trees, seeded with the iteration index i\n",
+ "\n",
+ " # variable selection using Recursive Feature Elimination (RFE) and train the RFE_GBT_model; NOTE: X has 10 predictors and n_features_to_select=10, so RFE eliminates nothing here\n",
+ " rfe2 = RFE(estimator=gbt2_model, n_features_to_select=10, step=1)\n",
+ " rfe2.fit(X_train, y_train)\n",
+ "\n",
+ " # retrain Gradient Boosting Tree with RFE_GBT_selected variables\n",
+ " rfe2_X_selected_train = X_train.iloc[:, rfe2.support_] # only RFE_GBT_selected variables\n",
+ " rfe2_X_selected_test = X_test.iloc[:, rfe2.support_]\n",
+ " gbt2_model.fit(rfe2_X_selected_train, y_train) # retrain the model\n",
+ "\n",
+ " # names of the RFE_GBT_selected variables and their importances after retraining (nothing is printed here)\n",
+ " rfe2_selected_features = X_train.columns[rfe2.support_]\n",
+ " rfe2_selected_importances = gbt2_model.feature_importances_\n",
+ "\n",
+ " # record the importance of variables\n",
+ " rfe2_iteration_importances_df.loc[i] = rfe2_selected_importances\n",
+ "\n",
+ " # check the accuracy of RFE_GBT_model\n",
+ "\n",
+ " rfe2_y_pred = gbt2_model.predict(rfe2_X_selected_test)\n",
+ " rfe2_accuracy = accuracy_score(y_test, rfe2_y_pred)\n",
+ " #print(\"\\nRFE_RF_Accuracy:\", rfe1_accuracy)\n",
+ " rfe2_accuracies_list.append(rfe2_accuracy)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RFE_GBT model features and avg_importances:\n",
+ "all_restaurants_count 0.291120\n",
+ "supermarkets_count 0.221202\n",
+ "Gesamtbetrag Einkünfte Median - [€] 0.122629\n",
+ "Anteil der Sozial-wohnungen an allen Wohnungen 0.111979\n",
+ "distance_rathaus 0.104781\n",
+ "Einwohner: innen je km² 0.070611\n",
+ "greencrocers_count 0.043105\n",
+ "biosupermarkets_count 0.019531\n",
+ "vegan_restaurants_count 0.007623\n",
+ "organic_restaurants_count 0.007419\n",
+ "dtype: float64\n",
+ "RFE_GBT avg_accuracy: 0.6195\n"
+ ]
+ }
+ ],
+ "source": [
+ "#---------------------average importances and accuracies----------------------#\n",
+ "#rfe2_accuracies_list\n",
+ "#rfe2_iteration_importances_df\n",
+ "\n",
+ "# calculate the average importance of features\n",
+ "rfe2_feature_mean_importances = rfe2_iteration_importances_df.mean(axis=0)\n",
+ "rfe2_feature_mean_importances\n",
+ "\n",
+ "#sort the features from highest importance to low\n",
+ "rfe2_feature_mean_importances_sorted = rfe2_feature_mean_importances.sort_values(ascending=False)\n",
+ "print('RFE_GBT model features and avg_importances:')\n",
+ "print(rfe2_feature_mean_importances_sorted)\n",
+ "\n",
+ "# calculate the average accuracy of the rfe2 model\n",
+ "\n",
+ "rfe2_avg_accuracy = np.mean(rfe2_accuracies_list)\n",
+ "print(\"RFE_GBT avg_accuracy:\", rfe2_avg_accuracy )\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Comparing selected variables from all 4 methods"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>Feature</th>\n",
+ " <th>Mean_Importance</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>all_restaurants_count</td>\n",
+ " <td>0.291120</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>supermarkets_count</td>\n",
+ " <td>0.221202</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>Gesamtbetrag Einkünfte Median - [€]</td>\n",
+ " <td>0.122629</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>Anteil der Sozial-wohnungen an allen Wohnungen</td>\n",
+ " <td>0.111979</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>distance_rathaus</td>\n",
+ " <td>0.104781</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5</th>\n",
+ " <td>Einwohner: innen je km²</td>\n",
+ " <td>0.070611</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>6</th>\n",
+ " <td>greencrocers_count</td>\n",
+ " <td>0.043105</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>7</th>\n",
+ " <td>biosupermarkets_count</td>\n",
+ " <td>0.019531</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>8</th>\n",
+ " <td>vegan_restaurants_count</td>\n",
+ " <td>0.007623</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>9</th>\n",
+ " <td>organic_restaurants_count</td>\n",
+ " <td>0.007419</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " Feature Mean_Importance\n",
+ "0 all_restaurants_count 0.291120\n",
+ "1 supermarkets_count 0.221202\n",
+ "2 Gesamtbetrag Einkünfte Median - [€] 0.122629\n",
+ "3 Anteil der Sozial-wohnungen an allen Wohnungen 0.111979\n",
+ "4 distance_rathaus 0.104781\n",
+ "5 Einwohner: innen je km² 0.070611\n",
+ "6 greencrocers_count 0.043105\n",
+ "7 biosupermarkets_count 0.019531\n",
+ "8 vegan_restaurants_count 0.007623\n",
+ "9 organic_restaurants_count 0.007419"
+ ]
+ },
+ "execution_count": 65,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#----------------Make all importance results into dataframes----------------------------\n",
+ "\n",
+ "rf_feature_mean_importances_df = pd.DataFrame(rf_feature_mean_importances_sorted, columns=['Mean_Importance'])\n",
+ "rf_feature_mean_importances_df.reset_index(inplace=True)\n",
+ "rf_feature_mean_importances_df.rename(columns = {'index' : 'Feature'}, inplace = True)\n",
+ "rf_feature_mean_importances_df\n",
+ "\n",
+ "gbt_feature_mean_importances_df = pd.DataFrame(gbt_feature_mean_importances_sorted, columns=['Mean_Importance'])\n",
+ "gbt_feature_mean_importances_df.reset_index(inplace=True)\n",
+ "gbt_feature_mean_importances_df.rename(columns = {'index' : 'Feature'}, inplace = True)\n",
+ "gbt_feature_mean_importances_df\n",
+ "\n",
+ "rfe1_feature_mean_importances_df = pd.DataFrame(rfe1_feature_mean_importances_sorted, columns=['Mean_Importance'])\n",
+ "rfe1_feature_mean_importances_df.reset_index(inplace=True)\n",
+ "rfe1_feature_mean_importances_df.rename(columns = {'index' : 'Feature'}, inplace = True)\n",
+ "rfe1_feature_mean_importances_df\n",
+ "\n",
+ "rfe2_feature_mean_importances_df = pd.DataFrame(rfe2_feature_mean_importances_sorted, columns=['Mean_Importance'])\n",
+ "rfe2_feature_mean_importances_df.reset_index(inplace=True)\n",
+ "rfe2_feature_mean_importances_df.rename(columns = {'index' : 'Feature'}, inplace = True)\n",
+ "rfe2_feature_mean_importances_df\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>RF_Feature</th>\n",
+ " <th>RF_avg_Importance</th>\n",
+ " <th>GBT_Feature</th>\n",
+ " <th>GBT_avg_Importance</th>\n",
+ " <th>RFE_RF_Feature</th>\n",
+ " <th>RFE_RF_avg_Importance</th>\n",
+ " <th>RFE_GBT_Feature</th>\n",
+ " <th>RFE_GBT_avg_Importance</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>all_restaurants_count</td>\n",
+ " <td>0.219926</td>\n",
+ " <td>all_restaurants_count</td>\n",
+ " <td>0.291120</td>\n",
+ " <td>all_restaurants_count</td>\n",
+ " <td>0.219926</td>\n",
+ " <td>all_restaurants_count</td>\n",
+ " <td>0.291120</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>supermarkets_count</td>\n",
+ " <td>0.186518</td>\n",
+ " <td>supermarkets_count</td>\n",
+ " <td>0.221202</td>\n",
+ " <td>supermarkets_count</td>\n",
+ " <td>0.186518</td>\n",
+ " <td>supermarkets_count</td>\n",
+ " <td>0.221202</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>Anteil der Sozial-wohnungen an allen Wohnungen</td>\n",
+ " <td>0.119212</td>\n",
+ " <td>Gesamtbetrag Einkünfte Median - [€]</td>\n",
+ " <td>0.122629</td>\n",
+ " <td>Anteil der Sozial-wohnungen an allen Wohnungen</td>\n",
+ " <td>0.119212</td>\n",
+ " <td>Gesamtbetrag Einkünfte Median - [€]</td>\n",
+ " <td>0.122629</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>Einwohner: innen je km²</td>\n",
+ " <td>0.118131</td>\n",
+ " <td>Anteil der Sozial-wohnungen an allen Wohnungen</td>\n",
+ " <td>0.111979</td>\n",
+ " <td>Einwohner: innen je km²</td>\n",
+ " <td>0.118131</td>\n",
+ " <td>Anteil der Sozial-wohnungen an allen Wohnungen</td>\n",
+ " <td>0.111979</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>Gesamtbetrag Einkünfte Median - [€]</td>\n",
+ " <td>0.114430</td>\n",
+ " <td>distance_rathaus</td>\n",
+ " <td>0.104781</td>\n",
+ " <td>Gesamtbetrag Einkünfte Median - [€]</td>\n",
+ " <td>0.114430</td>\n",
+ " <td>distance_rathaus</td>\n",
+ " <td>0.104781</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5</th>\n",
+ " <td>distance_rathaus</td>\n",
+ " <td>0.108481</td>\n",
+ " <td>Einwohner: innen je km²</td>\n",
+ " <td>0.070611</td>\n",
+ " <td>distance_rathaus</td>\n",
+ " <td>0.108481</td>\n",
+ " <td>Einwohner: innen je km²</td>\n",
+ " <td>0.070611</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>6</th>\n",
+ " <td>greencrocers_count</td>\n",
+ " <td>0.065882</td>\n",
+ " <td>greencrocers_count</td>\n",
+ " <td>0.043105</td>\n",
+ " <td>greencrocers_count</td>\n",
+ " <td>0.065882</td>\n",
+ " <td>greencrocers_count</td>\n",
+ " <td>0.043105</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>7</th>\n",
+ " <td>biosupermarkets_count</td>\n",
+ " <td>0.033396</td>\n",
+ " <td>biosupermarkets_count</td>\n",
+ " <td>0.019531</td>\n",
+ " <td>biosupermarkets_count</td>\n",
+ " <td>0.033396</td>\n",
+ " <td>biosupermarkets_count</td>\n",
+ " <td>0.019531</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>8</th>\n",
+ " <td>vegan_restaurants_count</td>\n",
+ " <td>0.024088</td>\n",
+ " <td>vegan_restaurants_count</td>\n",
+ " <td>0.007623</td>\n",
+ " <td>vegan_restaurants_count</td>\n",
+ " <td>0.024088</td>\n",
+ " <td>vegan_restaurants_count</td>\n",
+ " <td>0.007623</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>9</th>\n",
+ " <td>organic_restaurants_count</td>\n",
+ " <td>0.009936</td>\n",
+ " <td>organic_restaurants_count</td>\n",
+ " <td>0.007419</td>\n",
+ " <td>organic_restaurants_count</td>\n",
+ " <td>0.009936</td>\n",
+ " <td>organic_restaurants_count</td>\n",
+ " <td>0.007419</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " RF_Feature RF_avg_Importance \\\n",
+ "0 all_restaurants_count 0.219926 \n",
+ "1 supermarkets_count 0.186518 \n",
+ "2 Anteil der Sozial-wohnungen an allen Wohnungen 0.119212 \n",
+ "3 Einwohner: innen je km² 0.118131 \n",
+ "4 Gesamtbetrag Einkünfte Median - [€] 0.114430 \n",
+ "5 distance_rathaus 0.108481 \n",
+ "6 greencrocers_count 0.065882 \n",
+ "7 biosupermarkets_count 0.033396 \n",
+ "8 vegan_restaurants_count 0.024088 \n",
+ "9 organic_restaurants_count 0.009936 \n",
+ "\n",
+ " GBT_Feature GBT_avg_Importance \\\n",
+ "0 all_restaurants_count 0.291120 \n",
+ "1 supermarkets_count 0.221202 \n",
+ "2 Gesamtbetrag Einkünfte Median - [€] 0.122629 \n",
+ "3 Anteil der Sozial-wohnungen an allen Wohnungen 0.111979 \n",
+ "4 distance_rathaus 0.104781 \n",
+ "5 Einwohner: innen je km² 0.070611 \n",
+ "6 greencrocers_count 0.043105 \n",
+ "7 biosupermarkets_count 0.019531 \n",
+ "8 vegan_restaurants_count 0.007623 \n",
+ "9 organic_restaurants_count 0.007419 \n",
+ "\n",
+ " RFE_RF_Feature RFE_RF_avg_Importance \\\n",
+ "0 all_restaurants_count 0.219926 \n",
+ "1 supermarkets_count 0.186518 \n",
+ "2 Anteil der Sozial-wohnungen an allen Wohnungen 0.119212 \n",
+ "3 Einwohner: innen je km² 0.118131 \n",
+ "4 Gesamtbetrag Einkünfte Median - [€] 0.114430 \n",
+ "5 distance_rathaus 0.108481 \n",
+ "6 greencrocers_count 0.065882 \n",
+ "7 biosupermarkets_count 0.033396 \n",
+ "8 vegan_restaurants_count 0.024088 \n",
+ "9 organic_restaurants_count 0.009936 \n",
+ "\n",
+ " RFE_GBT_Feature RFE_GBT_avg_Importance \n",
+ "0 all_restaurants_count 0.291120 \n",
+ "1 supermarkets_count 0.221202 \n",
+ "2 Gesamtbetrag Einkünfte Median - [€] 0.122629 \n",
+ "3 Anteil der Sozial-wohnungen an allen Wohnungen 0.111979 \n",
+ "4 distance_rathaus 0.104781 \n",
+ "5 Einwohner: innen je km² 0.070611 \n",
+ "6 greencrocers_count 0.043105 \n",
+ "7 biosupermarkets_count 0.019531 \n",
+ "8 vegan_restaurants_count 0.007623 \n",
+ "9 organic_restaurants_count 0.007419 "
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "#Make all the avg_importance results in one dataframe\n",
+ "\n",
+ "df1 = rf_feature_mean_importances_df\n",
+ "df1.rename(columns = {'Feature' : 'RF_Feature', 'Mean_Importance' : 'RF_avg_Importance'}, inplace = True)\n",
+ "\n",
+ "df2 = gbt_feature_mean_importances_df\n",
+ "df2.rename(columns = {'Feature' : 'GBT_Feature', 'Mean_Importance' : 'GBT_avg_Importance'}, inplace = True)\n",
+ "\n",
+ "df3 = rfe1_feature_mean_importances_df\n",
+ "df3.rename(columns = {'Feature' : 'RFE_RF_Feature', 'Mean_Importance' : 'RFE_RF_avg_Importance'}, inplace = True)\n",
+ "\n",
+ "df4 = rfe2_feature_mean_importances_df\n",
+ "df4.rename(columns = {'Feature' : 'RFE_GBT_Feature', 'Mean_Importance' : 'RFE_GBT_avg_Importance'}, inplace = True)\n",
+ "\n",
+ "# save all 4 dataframes in a list\n",
+ "dfs = [df1, df2, df3, df4]\n",
+ "\n",
+ "# concatenate all 4 dataframes side by side (column-wise)\n",
+ "var_selections_df = pd.concat(dfs, axis=1, ignore_index=False)\n",
+ "\n",
+ "# put index (1,10)\n",
+ "#var_selections_df.index = range(1, 11)\n",
+ "var_selections_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>RF_Feature</th>\n",
+ " <th>RF_avg_Importance</th>\n",
+ " <th>GBT_Feature</th>\n",
+ " <th>GBT_avg_Importance</th>\n",
+ " <th>RFE_RF_Feature</th>\n",
+ " <th>RFE_RF_avg_Importance</th>\n",
+ " <th>RFE_GBT_Feature</th>\n",
+ " <th>RFE_GBT_avg_Importance</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>RF_avg_Accuracy</td>\n",
+ " <td>0.614500</td>\n",
+ " <td>GBT_avg_Accuracy</td>\n",
+ " <td>0.619500</td>\n",
+ " <td>RFE_RF_avg_Accuracy</td>\n",
+ " <td>0.614500</td>\n",
+ " <td>RFE_GBT_avg_Accuracy</td>\n",
+ " <td>0.619500</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>all_restaurants_count</td>\n",
+ " <td>0.219926</td>\n",
+ " <td>all_restaurants_count</td>\n",
+ " <td>0.291120</td>\n",
+ " <td>all_restaurants_count</td>\n",
+ " <td>0.219926</td>\n",
+ " <td>all_restaurants_count</td>\n",
+ " <td>0.291120</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>supermarkets_count</td>\n",
+ " <td>0.186518</td>\n",
+ " <td>supermarkets_count</td>\n",
+ " <td>0.221202</td>\n",
+ " <td>supermarkets_count</td>\n",
+ " <td>0.186518</td>\n",
+ " <td>supermarkets_count</td>\n",
+ " <td>0.221202</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>Anteil der Sozial-wohnungen an allen Wohnungen</td>\n",
+ " <td>0.119212</td>\n",
+ " <td>Gesamtbetrag Einkünfte Median - [€]</td>\n",
+ " <td>0.122629</td>\n",
+ " <td>Anteil der Sozial-wohnungen an allen Wohnungen</td>\n",
+ " <td>0.119212</td>\n",
+ " <td>Gesamtbetrag Einkünfte Median - [€]</td>\n",
+ " <td>0.122629</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>Einwohner: innen je km²</td>\n",
+ " <td>0.118131</td>\n",
+ " <td>Anteil der Sozial-wohnungen an allen Wohnungen</td>\n",
+ " <td>0.111979</td>\n",
+ " <td>Einwohner: innen je km²</td>\n",
+ " <td>0.118131</td>\n",
+ " <td>Anteil der Sozial-wohnungen an allen Wohnungen</td>\n",
+ " <td>0.111979</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>Gesamtbetrag Einkünfte Median - [€]</td>\n",
+ " <td>0.114430</td>\n",
+ " <td>distance_rathaus</td>\n",
+ " <td>0.104781</td>\n",
+ " <td>Gesamtbetrag Einkünfte Median - [€]</td>\n",
+ " <td>0.114430</td>\n",
+ " <td>distance_rathaus</td>\n",
+ " <td>0.104781</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5</th>\n",
+ " <td>distance_rathaus</td>\n",
+ " <td>0.108481</td>\n",
+ " <td>Einwohner: innen je km²</td>\n",
+ " <td>0.070611</td>\n",
+ " <td>distance_rathaus</td>\n",
+ " <td>0.108481</td>\n",
+ " <td>Einwohner: innen je km²</td>\n",
+ " <td>0.070611</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>6</th>\n",
+ " <td>greencrocers_count</td>\n",
+ " <td>0.065882</td>\n",
+ " <td>greencrocers_count</td>\n",
+ " <td>0.043105</td>\n",
+ " <td>greencrocers_count</td>\n",
+ " <td>0.065882</td>\n",
+ " <td>greencrocers_count</td>\n",
+ " <td>0.043105</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>7</th>\n",
+ " <td>biosupermarkets_count</td>\n",
+ " <td>0.033396</td>\n",
+ " <td>biosupermarkets_count</td>\n",
+ " <td>0.019531</td>\n",
+ " <td>biosupermarkets_count</td>\n",
+ " <td>0.033396</td>\n",
+ " <td>biosupermarkets_count</td>\n",
+ " <td>0.019531</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>8</th>\n",
+ " <td>vegan_restaurants_count</td>\n",
+ " <td>0.024088</td>\n",
+ " <td>vegan_restaurants_count</td>\n",
+ " <td>0.007623</td>\n",
+ " <td>vegan_restaurants_count</td>\n",
+ " <td>0.024088</td>\n",
+ " <td>vegan_restaurants_count</td>\n",
+ " <td>0.007623</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>9</th>\n",
+ " <td>organic_restaurants_count</td>\n",
+ " <td>0.009936</td>\n",
+ " <td>organic_restaurants_count</td>\n",
+ " <td>0.007419</td>\n",
+ " <td>organic_restaurants_count</td>\n",
+ " <td>0.009936</td>\n",
+ " <td>organic_restaurants_count</td>\n",
+ " <td>0.007419</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " RF_Feature RF_avg_Importance \\\n",
+ "0 RF_avg_Accuracy 0.614500 \n",
+ "0 all_restaurants_count 0.219926 \n",
+ "1 supermarkets_count 0.186518 \n",
+ "2 Anteil der Sozial-wohnungen an allen Wohnungen 0.119212 \n",
+ "3 Einwohner: innen je km² 0.118131 \n",
+ "4 Gesamtbetrag Einkünfte Median - [€] 0.114430 \n",
+ "5 distance_rathaus 0.108481 \n",
+ "6 greencrocers_count 0.065882 \n",
+ "7 biosupermarkets_count 0.033396 \n",
+ "8 vegan_restaurants_count 0.024088 \n",
+ "9 organic_restaurants_count 0.009936 \n",
+ "\n",
+ " GBT_Feature GBT_avg_Importance \\\n",
+ "0 GBT_avg_Accuracy 0.619500 \n",
+ "0 all_restaurants_count 0.291120 \n",
+ "1 supermarkets_count 0.221202 \n",
+ "2 Gesamtbetrag Einkünfte Median - [€] 0.122629 \n",
+ "3 Anteil der Sozial-wohnungen an allen Wohnungen 0.111979 \n",
+ "4 distance_rathaus 0.104781 \n",
+ "5 Einwohner: innen je km² 0.070611 \n",
+ "6 greencrocers_count 0.043105 \n",
+ "7 biosupermarkets_count 0.019531 \n",
+ "8 vegan_restaurants_count 0.007623 \n",
+ "9 organic_restaurants_count 0.007419 \n",
+ "\n",
+ " RFE_RF_Feature RFE_RF_avg_Importance \\\n",
+ "0 RFE_RF_avg_Accuracy 0.614500 \n",
+ "0 all_restaurants_count 0.219926 \n",
+ "1 supermarkets_count 0.186518 \n",
+ "2 Anteil der Sozial-wohnungen an allen Wohnungen 0.119212 \n",
+ "3 Einwohner: innen je km² 0.118131 \n",
+ "4 Gesamtbetrag Einkünfte Median - [€] 0.114430 \n",
+ "5 distance_rathaus 0.108481 \n",
+ "6 greencrocers_count 0.065882 \n",
+ "7 biosupermarkets_count 0.033396 \n",
+ "8 vegan_restaurants_count 0.024088 \n",
+ "9 organic_restaurants_count 0.009936 \n",
+ "\n",
+ " RFE_GBT_Feature RFE_GBT_avg_Importance \n",
+ "0 RFE_GBT_avg_Accuracy 0.619500 \n",
+ "0 all_restaurants_count 0.291120 \n",
+ "1 supermarkets_count 0.221202 \n",
+ "2 Gesamtbetrag Einkünfte Median - [€] 0.122629 \n",
+ "3 Anteil der Sozial-wohnungen an allen Wohnungen 0.111979 \n",
+ "4 distance_rathaus 0.104781 \n",
+ "5 Einwohner: innen je km² 0.070611 \n",
+ "6 greencrocers_count 0.043105 \n",
+ "7 biosupermarkets_count 0.019531 \n",
+ "8 vegan_restaurants_count 0.007623 \n",
+ "9 organic_restaurants_count 0.007419 "
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#----------------------------------------Add Accuracy of 4 models to the importance result----------------------------\n",
+ "# Prepend one row holding each model's average accuracy to the feature-importance table.\n",
+ "# The row reuses the *_Feature / *_avg_Importance columns of var_selections_df so both frames align on concat.\n",
+ "accuracy_row = {'RF_Feature': 'RF_avg_Accuracy', 'RF_avg_Importance': rf_avg_accuracy,\n",
+ " 'GBT_Feature': 'GBT_avg_Accuracy', 'GBT_avg_Importance': gbt_avg_accuracy,\n",
+ " 'RFE_RF_Feature': 'RFE_RF_avg_Accuracy', 'RFE_RF_avg_Importance': rfe1_avg_accuracy,\n",
+ " 'RFE_GBT_Feature': 'RFE_GBT_avg_Accuracy', 'RFE_GBT_avg_Importance': rfe2_avg_accuracy}\n",
+ "\n",
+ "# Label the row 'avg_accuracy' instead of 0: var_selections_df already has an index label 0,\n",
+ "# so index=[0] would produce two rows labelled 0 and make .loc[0] ambiguous.\n",
+ "accuracy_row_df = pd.DataFrame(accuracy_row, index=['avg_accuracy'])\n",
+ "\n",
+ "# Accuracy row first, then the features with their original 0-based labels.\n",
+ "var_accuracy_df = pd.concat([accuracy_row_df, var_selections_df])\n",
+ "var_accuracy_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [],
+ "source": [
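+ "#------------------------Save the result as a csv file------------------------\n",
+ "# The export is currently disabled: uncomment the last line of this cell to write\n",
+ "# var_accuracy_df (the importance table together with the accuracy row) to disk.\n",
+ "# save as CSV file\n",
+ "#var_accuracy_df.to_csv('Result of variable selections for weekendmarkets(of all 4 methods with model accuracy).csv')"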
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--
GitLab