import numpy as np
from math import sqrt

# 样本：某班成绩
scores = [85, 87, 90, 83, 88]

# 优先尝试 scipy，如不可用则提供简化实现
try:
    from scipy import stats
    t_stat, p_value = stats.ttest_1samp(scores, 85)
except Exception as e:
    # 手动实现（双侧）
    x = np.array(scores, dtype=float)
    n = x.size
    mean = x.mean()
    std = x.std(ddof=1)
    t_stat = (mean - 85) / (std / sqrt(n))
    # 使用正态近似的 p 值（近似），仅作兜底；更推荐安装 scipy 获取精确 t 分布 p 值
    # 这里不实现精确 t 分布 CDF（超出入门范围）
    from math import erf
    # 正态近似双侧 p 值
    p_value = 2*(1-0.5*(1+erf(abs(t_stat)/sqrt(2))))

print("t统计量:", round(t_stat, 2))
print("p值:", round(p_value, 4))

t统计量: 1.32
p值: 0.256


import numpy as np

x = np.array([85, 87, 90, 83, 88], dtype=float)
mean = np.mean(x)
std = np.std(x, ddof=1)
n = len(x)

confidence = 0.95

try:
    from scipy import stats
    t_critical = stats.t.ppf((1 + confidence) / 2, df=n-1)
except Exception as e:
    # 兜底：用正态近似的 1.96（小样本时会有偏差，建议安装 scipy）
    t_critical = 1.96

margin_error = t_critical * (std / np.sqrt(n))
ci_lower = mean - margin_error
ci_upper = mean + margin_error

print("样本均值:", round(mean, 2))
print("95%置信区间:", (round(ci_lower, 2), round(ci_upper, 2)))

样本均值: 86.6
95%置信区间: (83.25, 89.95)


import numpy as np

# 构造数据（例：学习小时数 X → 成绩 Y）
X_raw = np.array([1, 2, 3, 4, 5], dtype=float)
Y = np.array([2.5, 3.1, 4.2, 4.8, 5.5], dtype=float)

try:
    import statsmodels.api as sm
    X = sm.add_constant(X_raw)  # 添加常数项
    model = sm.OLS(Y, X).fit()
    print(model.summary())
except Exception as e:
    # 若未安装 statsmodels，则用最小二乘的简要实现与结果展示
    X = np.column_stack([np.ones_like(X_raw), X_raw])  # [1, X]
    beta_hat, *_ = np.linalg.lstsq(X, Y, rcond=None)   # [beta0, beta1]
    y_hat = X @ beta_hat
    resid = Y - y_hat
    rss = float((resid**2).sum())
    tss = float(((Y - Y.mean())**2).sum())
    r2 = 1 - rss/tss
    print("（简版 OLS 输出 - 建议安装 statsmodels 获取完整报告）")
    print(f"截距 β0 ≈ {beta_hat[0]:.4f},  斜率 β1 ≈ {beta_hat[1]:.4f}")
    print(f"R² ≈ {r2:.4f}")

C:\Users\Zhouq\AppData\Roaming\Python\Python39\site-packages\pandas\core\computation\expressions.py:21: UserWarning: Pandas requires version '2.8.4' or newer of 'numexpr' (version '2.8.3' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED
C:\Users\Zhouq\AppData\Roaming\Python\Python39\site-packages\pandas\core\arrays\masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.990
Model:                            OLS   Adj. R-squared:                  0.987
Method:                 Least Squares   F-statistic:                     301.5
Date:                Tue, 12 Aug 2025   Prob (F-statistic):           0.000416
Time:                        10:17:53   Log-Likelihood:                 4.0044
No. Observations:                   5   AIC:                            -4.009
Df Residuals:                       3   BIC:                            -4.790
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.7100      0.147     11.626      0.001       1.242       2.178
x1             0.7700      0.044     17.363      0.000       0.629       0.911
==============================================================================
Omnibus:                          nan   Durbin-Watson:                   2.908
Prob(Omnibus):                    nan   Jarque-Bera (JB):                0.219
Skew:                           0.351   Prob(JB):                        0.896
Kurtosis:                       2.254   Cond. No.                         8.37
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

C:\Users\Zhouq\AppData\Roaming\Python\Python39\site-packages\statsmodels\stats\stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "


A = np.array([82, 85, 88, 90, 91], dtype=float)   # 教学法A
B = np.array([78, 80, 84, 86, 87], dtype=float)   # 教学法B

try:
    from scipy import stats
    t2, p2 = stats.ttest_ind(A, B, equal_var=False)  # Welch 修正（推荐）
except Exception as e:
    # 兜底：用正态近似计算，结果仅供参考
    n1, n2 = len(A), len(B)
    mean1, mean2 = A.mean(), B.mean()
    s1, s2 = A.std(ddof=1), B.std(ddof=1)
    se = np.sqrt(s1**2/n1 + s2**2/n2)
    t2 = (mean1 - mean2)/se
    # 正态近似的双侧 p 值
    from math import erf, sqrt
    p2 = 2*(1-0.5*(1+erf(abs(t2)/sqrt(2))))

print("t统计量:", round(t2,2), "  p值:", round(p2,4))
print("A组均值:", round(A.mean(),2), " B组均值:", round(B.mean(),2))

t统计量: 1.75   p值: 0.1178
A组均值: 87.2  B组均值: 83.0


def cohens_d_independent(a, b):
    a = np.asarray(a, dtype=float); b = np.asarray(b, dtype=float)
    n1, n2 = len(a), len(b)
    s1, s2 = a.std(ddof=1), b.std(ddof=1)
    # 合并标准差（无偏估计）
    sp = np.sqrt(((n1-1)*s1**2 + (n2-1)*s2**2) / (n1+n2-2))
    return (a.mean() - b.mean()) / sp

d = cohens_d_independent(A, B)
print("Cohen's d:", round(d, 3))

Cohen's d: 1.109


# === 在此动手做题（你可以多建几个单元格）===

import numpy as np
import pandas as pd

# 练习 1：单样本 t 检验（均值=85）
sample1 = np.array([82, 85, 87, 90, 88, 86], dtype=float)
# TODO: 计算 t 与 p，并写出自然语言结论


# 练习 2：95% 置信区间（均值）
# TODO: 计算 95% CI，并用一句话解释


# 练习 3：两独立样本 t 检验 + 效应量
A = np.array([80, 83, 85, 88, 90], dtype=float)
B = np.array([76, 78, 80, 82, 85], dtype=float)
# TODO: Welch t 检验（equal_var=False），并计算 Cohen's d


# 练习 4：简单回归
# TODO: 自拟 X 与 Y，使用 statsmodels（或 numpy 兜底）拟合，输出系数与 R²


# 练习 1 参考答案
import numpy as np
sample1 = np.array([82, 85, 87, 90, 88, 86], dtype=float)

try:
    from scipy import stats
    t1, p1 = stats.ttest_1samp(sample1, 85)
except Exception as e:
    from math import sqrt, erf
    n = len(sample1)
    mean = sample1.mean()
    std = sample1.std(ddof=1)
    t1 = (mean - 85) / (std/np.sqrt(n))
    # 正态近似 p 值
    p1 = 2*(1-0.5*(1+erf(abs(t1)/np.sqrt(2))))

print("t统计量:", round(t1,2), " p值:", round(p1,4))
if p1 < 0.05:
    print("结论：在 5% 显著性水平下，拒绝 H0（总体均值=85）的假设。")
else:
    print("结论：在 5% 显著性水平下，无法拒绝 H0（总体均值=85）。")

t统计量: 1.2  p值: 0.2856
结论：在 5% 显著性水平下，无法拒绝 H0（总体均值=85）。


# 练习 2 参考答案（95% CI）
import numpy as np
sample1 = np.array([82, 85, 87, 90, 88, 86], dtype=float)

mean = sample1.mean()
std = sample1.std(ddof=1)
n = len(sample1)

try:
    from scipy import stats
    t_crit = stats.t.ppf(0.975, df=n-1)
except Exception as e:
    t_crit = 1.96  # 兜底近似

me = t_crit * (std/np.sqrt(n))
ci = (mean - me, mean + me)

print("均值:", round(mean,2), " 95%CI:", (round(ci[0],2), round(ci[1],2)))
print("解释：若重复抽样多次，约 95% 的此类区间会覆盖真实总体均值；并非“参数有 95% 概率在区间内”。")

均值: 86.33  95%CI: (83.47, 89.2)
解释：若重复抽样多次，约 95% 的此类区间会覆盖真实总体均值；并非“参数有 95% 概率在区间内”。


# 练习 3 参考答案：Welch t 检验 + Cohen's d
import numpy as np

A = np.array([80, 83, 85, 88, 90], dtype=float)
B = np.array([76, 78, 80, 82, 85], dtype=float)

try:
    from scipy import stats
    t2, p2 = stats.ttest_ind(A, B, equal_var=False)
except Exception as e:
    # 兜底：正态近似
    n1, n2 = len(A), len(B)
    mean1, mean2 = A.mean(), B.mean()
    s1, s2 = A.std(ddof=1), B.std(ddof=1)
    se = np.sqrt(s1**2/n1 + s2**2/n2)
    t2 = (mean1 - mean2)/se
    from math import erf, sqrt
    p2 = 2*(1-0.5*(1+erf(abs(t2)/sqrt(2))))

def cohens_d_independent(a, b):
    a = np.asarray(a, dtype=float); b = np.asarray(b, dtype=float)
    n1, n2 = len(a), len(b)
    s1, s2 = a.std(ddof=1), b.std(ddof=1)
    sp = np.sqrt(((n1-1)*s1**2 + (n2-1)*s2**2) / (n1+n2-2))
    return (a.mean() - b.mean()) / sp

d = cohens_d_independent(A, B)

print("Welch t:", round(t2,2), " p值:", round(p2,4))
print("Cohen's d:", round(d,3))

Welch t: 2.12  p值: 0.0677
Cohen's d: 1.339


# 练习 4 参考答案：简单回归
import numpy as np

X_raw = np.array([1,2,3,4,5,6], dtype=float)
Y = 1.5 + 0.8*X_raw + np.array([0.2, -0.1, 0.0, 0.1, -0.2, 0.3])  # 线性关系 + 少量噪声

try:
    import statsmodels.api as sm
    X = sm.add_constant(X_raw)
    model = sm.OLS(Y, X).fit()
    print(model.summary())
except Exception as e:
    X = np.column_stack([np.ones_like(X_raw), X_raw])
    beta_hat, *_ = np.linalg.lstsq(X, Y, rcond=None)
    y_hat = X @ beta_hat
    resid = Y - y_hat
    rss = float((resid**2).sum())
    tss = float(((Y - Y.mean())**2).sum())
    r2 = 1 - rss/tss
    print("（简版 OLS 输出）")
    print(f"截距 β0 ≈ {beta_hat[0]:.4f}, 斜率 β1 ≈ {beta_hat[1]:.4f}, R² ≈ {r2:.4f}")

                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.985
Model:                            OLS   Adj. R-squared:                  0.981
Method:                 Least Squares   F-statistic:                     263.5
Date:                Tue, 12 Aug 2025   Prob (F-statistic):           8.43e-05
Time:                        10:18:12   Log-Likelihood:                 2.1127
No. Observations:                   6   AIC:                           -0.2254
Df Residuals:                       4   BIC:                           -0.6418
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.5200      0.194      7.835      0.001       0.981       2.059
x1             0.8086      0.050     16.231      0.000       0.670       0.947
==============================================================================
Omnibus:                          nan   Durbin-Watson:                   2.583
Prob(Omnibus):                    nan   Jarque-Bera (JB):                0.416
Skew:                          -0.127   Prob(JB):                        0.812
Kurtosis:                       1.736   Cond. No.                         9.36
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

C:\Users\Zhouq\AppData\Roaming\Python\Python39\site-packages\statsmodels\stats\stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 6 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "

11.2 推断性统计（入门）¶

学习目标¶

11.2.1 语法要点（常用就这些）¶

A. 假设检验（以单样本 t 检验为例）¶

B. 置信区间（以均值为例）¶

C. 回归分析（线性回归 OLS）¶

11.2.2 示范（按教材示例逐一跑通）¶

示例 1：单样本 t 检验（均值是否为 85）¶

示例 2：95% 置信区间（均值）¶

示例 3：最小线性回归（OLS）¶

11.2.3 举例（贴近人文社科）¶

例 1：两独立样本 t 检验（比如：A/B 两种教学法的期末成绩）¶

例 2：效应量（Cohen's d）¶

例 3：回归的自然语言解读（示意）¶

11.2.4 注意事项（常见坑）¶

11.2.5 练习（建议先做再看答案）¶

11.2.6 参考答案（对照自检）¶

11.2 小结¶