2024.12.16
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
# Seed the legacy global NumPy RNG so every draw below is reproducible.
np.random.seed(42)

# Draw n_samples points from a zero-mean bivariate normal whose two
# coordinates have unit variance and covariance 0.7.
n_samples = 1000
mean = [0, 0]
cov = [[1, 0.7], [0.7, 1]]
samples = np.random.multivariate_normal(mean, cov, size=n_samples)
X, Y = samples[:, 0], samples[:, 1]

# Snapshot both coordinates before they are modified below.
X_orig = np.array(X)
Y_orig = np.array(Y)

# Perturb Y with noise drawn from N(1, 1): mean 1, standard deviation 1.
noise = np.random.normal(loc=1, scale=1, size=n_samples)
Y = Y + noise

# Shift every X value up by 1.
X = X + 1
# Scatter plot of the shifted X against the noised Y on a 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X, Y, alpha=0.5)  # semi-transparent markers so overlapping points stay visible
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_title('Synthetic Data with Correlation and Noise')
ax.grid(True)
plt.show()
# Baseline fit: regress Y on the shifted (not mean-centred) X by ordinary
# least squares. add_constant supplies the intercept column of the design matrix.
Y_no_demeaning = np.copy(Y)
X_no_demeaning = sm.add_constant(np.copy(X))
model_no_demeaning = sm.OLS(endog=Y_no_demeaning, exog=X_no_demeaning).fit()
# Bare expression: displays the summary table when it is the last statement
# of a notebook cell.
model_no_demeaning.summary()
Dep. Variable: | y | R-squared: | 0.205 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.204 |
Method: | Least Squares | F-statistic: | 256.8 |
Date: | Mon, 16 Dec 2024 | Prob (F-statistic): | 1.32e-51 |
Time: | 16:52:39 | Log-Likelihood: | -1617.5 |
No. Observations: | 1000 | AIC: | 3239. |
Df Residuals: | 998 | BIC: | 3249. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 0.3932 | 0.054 | 7.287 | 0.000 | 0.287 | 0.499 |
x1 | 0.6377 | 0.040 | 16.025 | 0.000 | 0.560 | 0.716 |
Omnibus: | 0.915 | Durbin-Watson: | 1.969 |
---|---|---|---|
Prob(Omnibus): | 0.633 | Jarque-Bera (JB): | 0.790 |
Skew: | 0.012 | Prob(JB): | 0.674 |
Kurtosis: | 3.136 | Cond. No. | 2.53 |
# Same regression with the regressor centred on its sample mean.
# Only X is centred: despite its name, y_demeaned is an unmodified copy of Y.
y_demeaned = np.copy(Y)
X_demeaned = sm.add_constant(X - np.mean(X))
model_demeaned = sm.OLS(endog=y_demeaned, exog=X_demeaned).fit()
# Bare expression: displays the summary table when it is the last statement
# of a notebook cell.
model_demeaned.summary()
Dep. Variable: | y | R-squared: | 0.205 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.204 |
Method: | Least Squares | F-statistic: | 256.8 |
Date: | Mon, 16 Dec 2024 | Prob (F-statistic): | 1.32e-51 |
Time: | 16:52:41 | Log-Likelihood: | -1617.5 |
No. Observations: | 1000 | AIC: | 3239. |
Df Residuals: | 998 | BIC: | 3249. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 0.9973 | 0.039 | 25.832 | 0.000 | 0.922 | 1.073 |
x1 | 0.6377 | 0.040 | 16.025 | 0.000 | 0.560 | 0.716 |
Omnibus: | 0.915 | Durbin-Watson: | 1.969 |
---|---|---|---|
Prob(Omnibus): | 0.633 | Jarque-Bera (JB): | 0.790 |
Skew: | 0.012 | Prob(JB): | 0.674 |
Kurtosis: | 3.136 | Cond. No. | 1.03 |
# Regress Y on a noise-contaminated regressor: the original X plus an
# independent N(1, 1) draw (mean 1, standard deviation 1), without centring.
epsilon_X = np.random.normal(loc=1, scale=1, size=n_samples)
y_noised_no_demeaning = np.copy(Y)
X_noised_no_demeaning = sm.add_constant(X_orig + epsilon_X)
model_noised_no_demeaning = sm.OLS(
    endog=y_noised_no_demeaning,
    exog=X_noised_no_demeaning,
).fit()
# Bare expression: displays the summary table when it is the last statement
# of a notebook cell.
model_noised_no_demeaning.summary()
Dep. Variable: | y | R-squared: | 0.110 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.109 |
Method: | Least Squares | F-statistic: | 122.7 |
Date: | Mon, 16 Dec 2024 | Prob (F-statistic): | 5.51e-27 |
Time: | 17:00:38 | Log-Likelihood: | -1674.0 |
No. Observations: | 1000 | AIC: | 3352. |
Df Residuals: | 998 | BIC: | 3362. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 0.7050 | 0.049 | 14.497 | 0.000 | 0.610 | 0.800 |
x1 | 0.3255 | 0.029 | 11.079 | 0.000 | 0.268 | 0.383 |
Omnibus: | 2.965 | Durbin-Watson: | 1.905 |
---|---|---|---|
Prob(Omnibus): | 0.227 | Jarque-Bera (JB): | 3.103 |
Skew: | -0.053 | Prob(JB): | 0.212 |
Kurtosis: | 3.251 | Cond. No. | 2.24 |
# Refit the noisy-regressor model after centring the regressor on its mean.
# Column 1 of the earlier design matrix holds the noisy regressor.
noisy_regressor = X_noised_no_demeaning[:, 1]
X_noised_demeaned = sm.add_constant(noisy_regressor - np.mean(noisy_regressor))
# Only the regressor is centred; the response is an unmodified copy of Y.
y_noised_demeaned = np.copy(Y)
model_noised_demeaned = sm.OLS(
    endog=y_noised_demeaned,
    exog=X_noised_demeaned,
).fit()
# Bare expression: displays the summary table when it is the last statement
# of a notebook cell.
model_noised_demeaned.summary()
Dep. Variable: | y | R-squared: | 0.110 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.109 |
Method: | Least Squares | F-statistic: | 122.7 |
Date: | Mon, 16 Dec 2024 | Prob (F-statistic): | 5.51e-27 |
Time: | 17:00:38 | Log-Likelihood: | -1674.0 |
No. Observations: | 1000 | AIC: | 3352. |
Df Residuals: | 998 | BIC: | 3362. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 0.9973 | 0.041 | 24.413 | 0.000 | 0.917 | 1.077 |
x1 | 0.3255 | 0.029 | 11.079 | 0.000 | 0.268 | 0.383 |
Omnibus: | 2.965 | Durbin-Watson: | 1.905 |
---|---|---|---|
Prob(Omnibus): | 0.227 | Jarque-Bera (JB): | 3.103 |
Skew: | -0.053 | Prob(JB): | 0.212 |
Kurtosis: | 3.251 | Cond. No. | 1.39 |