Delete Corr_P_VIF.py

010b75a6 · Oh, Sojung · 98cc620e · 98cc620e
Commit 010b75a6 authored Apr 10, 2024 by Oh, Sojung
--- a/Correlation/Corr_P_VIF.py
+++ b/Correlation/Corr_P_VIF.py
-import pandas as pd
-import matplotlib.pyplot as plt
-
-# Excel workbook to analyse.
-filepath = "emerged.xlsx"
-df = pd.read_excel(filepath)
-#%%
-
-# Use the first column as the row index, which removes it from df.columns
-# and therefore from the per-column processing below.
-df = df.set_index(df.columns[0])
-#%%
-# Data cleaning: coerce every column to numbers, then impute gaps with the column mean.
-
-for column in df.columns:
-    # Convert the column to numeric; values that cannot be parsed become NaN.
-    df[column] = pd.to_numeric(df[column], errors='coerce')
-
-    # Replace NaN (originally missing or unparseable cells) with the column mean.
-    # NOTE(review): both branches below execute the same statement, and after the
-    # coercion above the column is expected to be numeric already, so the else
-    # branch looks unreachable - confirm and collapse.
-    # NOTE(review): a column with no parseable values has a NaN mean, so it would
-    # stay all-NaN after fillna - verify the input never contains such a column.
-    if pd.api.types.is_numeric_dtype(df[column]):
-        df[column] = df[column].fillna(df[column].mean())
-    else:
-        # Same mean imputation as the branch above.
-        df[column] = df[column].fillna(df[column].mean())
-
-
-# Compute correlation & p-value
-from scipy.stats import pearsonr
-
-
-# Function that computes the pairwise correlation & p-value
-def calculate_correlation_and_vif(df):
-    """Compute the Pearson correlation and p-value for every pair of columns.
-
-    Each unordered pair of distinct columns is evaluated once (i < j), so a
-    frame with n columns yields n * (n - 1) / 2 rows.
-
-    NOTE(review): despite the name, this function does not compute VIF; it
-    only calls ``pearsonr``.
-
-    Args:
-        df: DataFrame whose columns are passed pairwise to ``pearsonr``.
-
-    Returns:
-        A DataFrame with one row per column pair and the columns
-        'Column 1', 'Column 2', 'Correlation Coefficient' and 'P-Value'.
-    """
-    columns = df.columns
-    correlation_results = []
-
-    # Start j at i + 1 to skip self-pairs and mirrored duplicates.
-    for i in range(len(columns)):
-        for j in range(i + 1, len(columns)):
-            column1 = columns[i]
-            column2 = columns[j]
-
-            # pearsonr returns the correlation coefficient and its p-value.
-            correlation_coefficient, p_value = pearsonr(df[column1], df[column2])
-
-            # Record the pair together with its statistics.
-            result = {
-                'Column 1': column1,
-                'Column 2': column2,
-                'Correlation Coefficient': correlation_coefficient,
-                'P-Value': p_value
-            }
-            correlation_results.append(result)
-
-    return pd.DataFrame(correlation_results)
-
-
-
-# Compute correlation & p-value 
-# One row per column pair, as returned by calculate_correlation_and_vif.
-corr_df = calculate_correlation_and_vif(df)
-# Index the rows by the first two result columns (the column-name pair).
-# NOTE(review): if df has fewer than two columns the result frame is empty and
-# corr_df.columns[0] would raise IndexError - confirm the input always has >= 2.
-corr_df = corr_df.set_index([corr_df.columns[0], corr_df.columns[1]])
-print(corr_df)
-
-#compute VIF
-from statsmodels.stats.outliers_influence import variance_inflation_factor
-
-# define VIF function
-def calculate_vif(data_frame):
-    """Compute the variance inflation factor (VIF) for every column.
-
-    Args:
-        data_frame: DataFrame whose ``.values`` matrix is passed to
-            ``variance_inflation_factor``, once per column index.
-
-    Returns:
-        A DataFrame with the columns "Variable" (column name) and "VIF",
-        one row per column of ``data_frame``.
-    """
-    variables = data_frame.columns
-    vif_data = pd.DataFrame()
-    vif_data["Variable"] = variables
-    # NOTE(review): no intercept/constant column is added before the call;
-    # verify against the statsmodels documentation whether one is needed
-    # for the VIF values to be meaningful.
-    vif_data["VIF"] = [variance_inflation_factor(data_frame.values, i) for i in range(data_frame.shape[1])]
-    return vif_data
-
-# Compute VIF
-vif_result = calculate_vif(df)
-#print(vif_result)
-
-# Write the results to an Excel file
-excel_file_path = 'correlation_pvalue_VIF.xlsx'
-
-# The context manager saves and closes the workbook on exit.
-with pd.ExcelWriter(excel_file_path, engine='xlsxwriter') as writer:
-    # One sheet per result: correlations (with their pair index) on Sheet1,
-    # VIF values (without the default integer index) on Sheet2.
-    corr_df.to_excel(writer, sheet_name='Sheet1', index=True)
-    vif_result.to_excel(writer, sheet_name='Sheet2', index=False)
\ No newline at end of file