"""Pairwise Pearson correlation, p-values and VIF for a spreadsheet.

Reconstructed from ``Correlation/Corr_P_VIF.py``.  The script reads
``emerged.xlsx``, uses the first column as the index, coerces every other
column to numeric, fills gaps with the column mean, and writes the
correlation/p-value table and the VIF table to separate sheets of
``correlation_pvalue_VIF.xlsx``.
"""
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import pearsonr
from statsmodels.stats.outliers_influence import variance_inflation_factor

filepath = "emerged.xlsx"
df = pd.read_excel(filepath)
#%%

# Set the first column as the index.
df = df.set_index(df.columns[0])
#%%
# Data cleaning

# Convert every column to numeric; values that cannot be parsed become NaN.
df = df.apply(pd.to_numeric, errors='coerce')

# A column with no numeric value at all has a NaN mean, so the mean fill
# below could never repair it and the NaNs would reach pearsonr and the VIF
# regressions.  Drop such columns up front and say so.
unusable_columns = df.columns[df.isna().all()]
if len(unusable_columns) > 0:
    print(f"Dropping columns without numeric data: {list(unusable_columns)}")
    df = df.drop(columns=unusable_columns)

# Fill the remaining missing values with the mean of their column.
df = df.fillna(df.mean())


def calculate_correlation_and_vif(df):
    """Return Pearson correlation and p-value for every pair of columns.

    Despite its name, this function does not compute VIF; see
    ``calculate_vif`` for that.

    Args:
        df: DataFrame whose columns are numeric and free of NaN.

    Returns:
        DataFrame with one row per unordered column pair (in column order)
        and the columns 'Column 1', 'Column 2', 'Correlation Coefficient'
        and 'P-Value'.  The frame has these columns even when ``df`` has
        fewer than two columns and therefore no pairs.
    """
    correlation_results = []

    for column1, column2 in combinations(df.columns, 2):
        correlation_coefficient, p_value = pearsonr(df[column1], df[column2])
        correlation_results.append({
            'Column 1': column1,
            'Column 2': column2,
            'Correlation Coefficient': correlation_coefficient,
            'P-Value': p_value,
        })

    # Naming the columns explicitly keeps the schema stable for an empty
    # result, so callers can still index by column position or name.
    return pd.DataFrame(
        correlation_results,
        columns=['Column 1', 'Column 2', 'Correlation Coefficient', 'P-Value'],
    )


# Compute correlation & p-value, indexed by the pair of column names.
corr_df = calculate_correlation_and_vif(df)
corr_df = corr_df.set_index([corr_df.columns[0], corr_df.columns[1]])
print(corr_df)


def calculate_vif(data_frame):
    """Return the variance inflation factor of every column.

    Args:
        data_frame: DataFrame of numeric explanatory variables without NaN.

    Returns:
        DataFrame with the columns 'Variable' (column name) and 'VIF'.
    """
    # NOTE(review): the matrix is passed to statsmodels exactly as given and
    # no intercept column is added here -- confirm that is intended.
    design_matrix = data_frame.values
    vif_data = pd.DataFrame()
    vif_data["Variable"] = data_frame.columns
    vif_data["VIF"] = [
        variance_inflation_factor(design_matrix, position)
        for position in range(design_matrix.shape[1])
    ]
    return vif_data


# Compute VIF
vif_result = calculate_vif(df)

# Write the results to an Excel file.
excel_file_path = 'correlation_pvalue_VIF.xlsx'

with pd.ExcelWriter(excel_file_path, engine='xlsxwriter') as writer:
    # Each DataFrame goes to its own sheet.
    corr_df.to_excel(writer, sheet_name='Sheet1', index=True)
    vif_result.to_excel(writer, sheet_name='Sheet2', index=False)