Skip to content
Snippets Groups Projects
Commit 010b75a6 authored by Oh, Sojung's avatar Oh, Sojung
Browse files

Delete Corr_P_VIF.py

parent 98cc620e
No related branches found
No related tags found
No related merge requests found
import pandas as pd
import matplotlib.pyplot as plt
# Source workbook holding the merged data set
filepath = "emerged.xlsx"
df = pd.read_excel(filepath)
#%%
# Use the first column as the row index so only the remaining columns
# take part in the analysis below
index_label = df.columns[0]
df = df.set_index(index_label)
#%%
# Data cleaning
# Coerce every column to numeric: entries that cannot be parsed become NaN
# (errors='coerce'), and each NaN is then replaced by that column's mean.
# After the coercion the column is always numeric, so a single code path
# covers every column; the former if/else had two identical branches.
# NOTE(review): a column with no parseable values has a NaN mean and stays
# all-NaN after this step — confirm the input never contains such a column.
for column in df.columns:
    numeric_values = pd.to_numeric(df[column], errors='coerce')
    df[column] = numeric_values.fillna(numeric_values.mean())
# Computed correlation & P-value
from scipy.stats import pearsonr
#making function for correlation & P-value
def calculate_correlation_and_vif(df: pd.DataFrame) -> pd.DataFrame:
    """Return the Pearson correlation and p-value for every pair of columns.

    Despite its name, this function does not compute any VIF; it only runs
    ``pearsonr`` on each unordered pair of columns of *df*.

    Args:
        df: Table whose columns are compared pairwise.

    Returns:
        A DataFrame with one row per column pair and the columns
        'Column 1', 'Column 2', 'Correlation Coefficient' and 'P-Value'.
        The four columns are present even when *df* has fewer than two
        columns (the result then has zero rows), so callers can always
        index on the first two columns.
    """
    from itertools import combinations

    result_columns = [
        'Column 1',
        'Column 2',
        'Correlation Coefficient',
        'P-Value',
    ]
    correlation_results = []
    # combinations() yields each unordered pair once, in column order
    for column1, column2 in combinations(df.columns, 2):
        correlation_coefficient, p_value = pearsonr(df[column1], df[column2])
        correlation_results.append({
            'Column 1': column1,
            'Column 2': column2,
            'Correlation Coefficient': correlation_coefficient,
            'P-Value': p_value,
        })
    # Explicit columns keep the schema stable when no pair exists
    return pd.DataFrame(correlation_results, columns=result_columns)
# Run the pairwise analysis, then index each row by its pair of column names
corr_df = calculate_correlation_and_vif(df)
pair_labels = [corr_df.columns[0], corr_df.columns[1]]
corr_df = corr_df.set_index(pair_labels)
print(corr_df)
#compute VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
# define VIF function
def calculate_vif(data_frame: pd.DataFrame, *, add_intercept: bool = True) -> pd.DataFrame:
    """Return the variance inflation factor (VIF) of every column.

    ``variance_inflation_factor`` regresses one column on the remaining
    columns of exactly the matrix it is given and adds no intercept itself.
    Without a constant column the resulting VIFs are based on an uncentred
    R-squared, so a constant is added to the design matrix by default.

    Args:
        data_frame: Numeric table of explanatory variables.
        add_intercept: When True (default), add a constant column to the
            design matrix unless one is already present. Pass False to get
            the values computed on *data_frame* exactly as given.

    Returns:
        A DataFrame with the columns 'Variable' and 'VIF', one row per
        column of *data_frame* (the added constant is not reported).
    """
    variables = list(data_frame.columns)
    if not variables:
        # Nothing to regress; keep the output schema
        return pd.DataFrame({"Variable": variables, "VIF": []})

    design = data_frame
    if add_intercept:
        from statsmodels.tools.tools import add_constant

        # has_constant="skip" leaves the data unchanged when it already
        # contains a constant column
        design = add_constant(data_frame, prepend=True, has_constant="skip")

    # The constant, when added, is prepended: shift positions past it
    offset = design.shape[1] - data_frame.shape[1]
    design_values = design.values
    vif_values = [
        variance_inflation_factor(design_values, offset + position)
        for position in range(len(variables))
    ]
    return pd.DataFrame({"Variable": variables, "VIF": vif_values})
# Compute VIF
vif_result = calculate_vif(df)
# Write the results to an Excel workbook, one DataFrame per sheet
excel_file_path = 'correlation_pvalue_VIF.xlsx'
# (sheet name, frame, whether to write the frame's index)
sheet_layout = (
    ('Sheet1', corr_df, True),
    ('Sheet2', vif_result, False),
)
with pd.ExcelWriter(excel_file_path, engine='xlsxwriter') as writer:
    for sheet_name, frame, keep_index in sheet_layout:
        frame.to_excel(writer, sheet_name=sheet_name, index=keep_index)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment