Ex. 18.19
Consider the data in Table 18.4 of Section (18.7), available from the book website.
(a) Using a symmetric two-sided rejection region based on the \(t\)-statistic, compute the plug-in estimate of the FDR for various values of the cut-point.
(b) Carry out the BH procedure for various FDR levels \(\alpha\) and show the equivalence of your results with those from part (a).
(c) Let \((q_{.25}, q_{.75})\) be the quartiles of the \(t\)-statistics from the permuted datasets. Let \(\hat\pi_0 = \{\# t_j\in (q_{.25}, q_{.75})\}/(.5M)\), and set \(\hat\pi_0=\min(\hat\pi_0, 1)\). Multiply the FDR estimates from (a) by \(\hat\pi_0\) and examine the results.
(d) Give a motivation for the estimate in part (c).
(Storey, 2003 \cite{storey2003positive}).
Soln. 18.19
See Code below for (a)–(c).
(d) This is an estimate of \(M_0/M\) in (18.45).
Code
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.stats.multitest as mt
import pathlib
import timeit
# get relative data folder
# __file__ lives one level below the project root; data/ sits beside it
PATH = pathlib.Path(__file__).resolve().parents[1]
DATA_PATH = PATH.joinpath("data").resolve()
# data
# Table 18.4 radiation-sensitivity microarray data: rows = genes,
# columns = 58 samples (44 normal, 14 sensitive, per the indices below)
# -- TODO confirm column order against the book website file
X = pd.read_csv(DATA_PATH.joinpath("microarray.csv"), header=0)
def t_wrapper(arr, ind1=None, ind2=None):
    """Two-sample t-statistic between the entries of *arr* selected by
    *ind1* (group 1) and *ind2* (group 2)."""
    group_a = np.asarray(arr[ind1])
    group_b = np.asarray(arr[ind2])
    return stats.ttest_ind(group_a, group_b).statistic
def perm_t_wrapper(arr, perms, split=44):
    """t-statistics of *arr* under each permutation of the sample labels.

    For each permutation, the first *split* indices form group 1 and the
    rest form group 2.  The default split=44 matches the 44 normal /
    14 sensitive samples of Table 18.4; the original hard-coded it.

    Returns a list with one t-statistic per permutation.
    """
    return [
        t_wrapper(arr, ind1=perm[:split], ind2=perm[split:])
        for perm in perms
    ]
def get_t_stats(X, ind1, ind2):
    """Row-wise two-sample t-statistics for the expression matrix *X*,
    comparing columns *ind1* against columns *ind2*."""
    per_gene = X.apply(t_wrapper, axis=1, ind1=ind1, ind2=ind2)
    return np.asarray(per_gene)
def get_tjk(X, K=1000, n=58):
    """K permutation null t-statistics per gene (row of *X*).

    Each permutation shuffles the *n* sample labels (n=58 in Table 18.4;
    the original hard-coded it); perm_t_wrapper splits each permutation
    back into the two groups.  Returns a Series of K-lists, one per gene.

    NOTE(review): uses the global np.random state — pass through a seeded
    Generator if reproducibility is needed.
    """
    perms = [np.random.permutation(n) for _ in range(K)]
    return X.apply(perm_t_wrapper, axis=1, perms=perms)
def get_FDR(t_orig, tjk, C, K=None):
    """Plug-in FDR estimate at cut-point *C* (two-sided rejection region).

    R_obs = #{j : |t_j| > C}            observed rejections
    E[V]  = #{(j,k) : |t_jk| > C} / K   expected false rejections under
                                        the permutation null

    K now defaults to tjk.shape[1] (the number of permutations): the
    original made K mandatory even though it is implicit in *tjk*, and
    the call site omitted it, raising TypeError.
    """
    if K is None:
        K = tjk.shape[1]
    R_obs = (np.abs(t_orig) > C).sum()
    E_V = (np.abs(tjk) > C).sum() / K
    return E_V / R_obs
def get_p_value(t_orig, tjk):
    """Two-sided permutation p-values per gene, returned sorted ascending.

    p_j = #{k : |t_jk| > |t_j|} / K, using each gene's own K permutation
    t-statistics as its null distribution.

    Fixes two defects in the original:
    * np.arange(p) allocated an *int* array, so every fractional p-value
      assigned into it was silently truncated to 0;
    * the comparison was one-sided (tjk > t_orig), inconsistent with the
      symmetric two-sided rejection region of part (a).
    """
    p, K = tjk.shape
    p_values = np.empty(p, dtype=float)
    for j in range(p):
        p_values[j] = np.count_nonzero(np.abs(tjk[j]) > abs(t_orig[j])) / K
    # sorted ascending so the caller may pass is_sorted=True to
    # statsmodels' multipletests
    p_values.sort()
    return p_values
# Sample layout of Table 18.4: columns 0..43 normal, 44..57 sensitive.
normal_indices = np.arange(44)
sensitive_indices = np.arange(44, 58)

# (a) observed t-statistics and the permutation null distribution
t_orig = get_t_stats(X, normal_indices, sensitive_indices)
tjk = get_tjk(X, K=1000)  # ~ 20mins

# Plug-in FDR estimates for several cut-points C.
# Bug fixes vs. original: the get_FDR call was missing its required K
# argument (TypeError); K=1000 is now passed explicitly.
C = [4, 4.101, 4.2]
tjk_arr = np.asarray(tjk)
plugin_FDR = [get_FDR(t_orig, tjk_arr, c, K=1000) for c in C]

# (b) BH procedure at alpha levels equal to the plug-in FDRs from (a),
# to exhibit the equivalence of the two procedures.
# Bug fixes vs. original: multipletests returns a tuple (reject,
# pvals_corrected, ...), so `res.reject` raised AttributeError; and its
# default method is Holm-Sidak ('hs'), not BH — part (b) asks for BH,
# hence method="fdr_bh".
p_values = get_p_value(t_orig, tjk_arr)
alphas = plugin_FDR
L = []
for alpha in alphas:
    reject = mt.multipletests(p_values, alpha, method="fdr_bh", is_sorted=True)[0]
    L.append(np.count_nonzero(reject))
print(L)