Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
rictjo
impetuous
提交
449a8759
I
impetuous
项目概览
rictjo
/
impetuous
大约 1 年 前同步成功
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
I
impetuous
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
449a8759
编写于
5月 15, 2019
作者:
rictjo
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
p.adj wrap
上级
6641eb27
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
92 addition
and
20 deletion
+92
-20
README.md
README.md
+6
-3
setup.py
setup.py
+1
-1
src/impetuous/clustering.py
src/impetuous/clustering.py
+54
-12
src/impetuous/quantification.py
src/impetuous/quantification.py
+31
-4
未找到文件。
README.md
浏览文件 @
449a8759
...
...
@@ -9,11 +9,14 @@ The journal and analyte expression file must be ordered
the same way with respect to the samples that are
positioned on the columns.
Visit the active code via:
Visit the active code via
:
https://github.com/richardtjornhammar/impetuous
Visit the published code:
Visit the published code
:
https://doi.org/10.5281/zenodo.2594691
Cite using
Cite using
:
DOI: 10.5281/zenodo.2594691
Install with :
pip install impetuous-gfa
setup.py
浏览文件 @
449a8759
...
...
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
setuptools
.
setup
(
name
=
"impetuous-gfa"
,
version
=
"0.2.3
0
"
,
version
=
"0.2.3
2
"
,
author
=
"Richard Tjörnhammar"
,
author_email
=
"richard.tjornhammar@gmail.com"
,
description
=
"Impetuous Quantification, Enrichment and Group Factor Analysis"
,
...
...
src/impetuous/clustering.py
浏览文件 @
449a8759
...
...
@@ -21,6 +21,7 @@ class Cluster(object):
self
.
labels_
=
None
self
.
df_
=
None
self
.
num_index_
=
None
self
.
components_
=
None
def
approximate_density_clustering
(
self
,
df
,
nbins
=
None
)
:
#
...
...
@@ -31,6 +32,7 @@ class Cluster(object):
self
.
df_
=
df
frac_df
=
df
.
apply
(
lambda
x
:
self
.
rankdata
(
x
,
method
=
'average'
)
/
float
(
len
(
x
))
)
self
.
pca_f
.
fit
(
frac_df
.
T
.
values
)
self
.
components_
=
self
.
pca_f
.
components_
vals
,
xe
,
ye
=
self
.
histogram2d
(
self
.
pca_f
.
components_
[
0
],
self
.
pca_f
.
components_
[
1
],
bins
=
nbins
)
mvs
,
svsx
,
svsy
=
np
.
mean
(
vals
),
np
.
std
(
vals
,
0
),
np
.
std
(
vals
,
1
)
svs
=
np
.
sqrt
(
svsx
**
2
+
svsy
**
2
)
...
...
@@ -57,38 +59,43 @@ class Cluster(object):
for
k
,
v
in
self
.
analyte_dict_
.
items
()
:
print
(
'CLU-'
+
str
(
k
),
'
\t
DESCRIPTION
\t
'
+
'
\t
'
.
join
(
v
),
file
=
of
)
class
ManifoldClustering
(
Cluster
):
class
ManifoldClustering
(
Cluster
)
:
def
__init__
(
self
,
nbins
=
50
)
:
from
sklearn.cluster
import
KMeans
from
sklearn.manifold
import
MDS
from
sklearn.manifold
import
MDS
,
TSNE
from
numpy
import
histogram2d
from
scipy.stats
import
rankdata
self
.
nbins
=
nbins
self
.
histogram2d
=
histogram2d
self
.
KMeans
=
KMeans
self
.
rankdata
=
rankdata
self
.
mds
=
MDS
(
n_components
=
2
)
self
.
mds
=
MDS
(
n_components
=
2
)
self
.
tsne
=
TSNE
(
n_components
=
2
)
self
.
man
=
None
self
.
centroids_
=
None
self
.
labels_
=
None
self
.
df_
=
None
self
.
num_index_
=
None
self
.
components_
=
None
self
.
components_
=
None
def
approximate_embedding
(
self
,
df
,
nbins
=
None
)
:
print
(
'WARNING::SLOW AND WASTEFUL'
)
def
approximate_embedding
(
self
,
df
,
nbins
=
None
,
use_tsne
=
True
)
:
self
.
man
=
self
.
tsne
if
not
use_tsne
:
self
.
man
=
self
.
mds
print
(
'WARNING::SLOW AND WASTEFUL'
)
if
nbins
is
None
:
nbins
=
self
.
nbins
self
.
df_
=
df
frac_df
=
df
.
apply
(
lambda
x
:
self
.
rankdata
(
x
,
method
=
'average'
)
/
float
(
len
(
x
))
)
self
.
components_
=
np
.
array
(
self
.
mds
.
fit_transform
(
frac_df
.
values
))
self
.
components_
=
np
.
array
(
self
.
man
.
fit_transform
(
frac_df
.
values
)).
T
vals
,
xe
,
ye
=
self
.
histogram2d
(
self
.
components_
[
0
],
self
.
components_
[
1
],
bins
=
nbins
)
mvs
,
svsx
,
svsy
=
np
.
mean
(
vals
),
np
.
std
(
vals
,
0
),
np
.
std
(
vals
,
1
)
svs
=
np
.
sqrt
(
svsx
**
2
+
svsy
**
2
)
svs
=
np
.
sqrt
(
svsx
**
2
+
svsy
**
2
)
#
# IS THERE A DENSITY PEAK SEPARABLE FROM THE MEAN
# SHOULD DO GRADIENT REJECTION BASED ON TTEST PVALUES
hits
=
vals
>
mvs
+
0.5
*
svs
#print(hits,vals)
xe_
,
ye_
=
0.5
*
(
xe
[:
1
]
+
xe
[
1
:]),
0.5
*
(
ye
[:
1
]
+
ye
[
1
:])
idx
=
np
.
where
(
hits
);
xi
,
yj
=
idx
[
0
],
idx
[
1
]
centroids
=
[
(
xe
[
ri
],
ye
[
rj
])
for
(
ri
,
rj
)
in
zip
(
xi
,
yj
)
]
...
...
@@ -111,14 +118,49 @@ def run_clustering_and_write_gmt( df , ca , filename = './approx_cluster_file.gm
analytes
=
df
.
iloc
[
llabs
==
ulab
].
index
.
values
print
(
'CLU-'
+
str
(
ulab
),
'
\t
DESCRIPTION
\t
'
+
'
\t
'
.
join
(
analytes
),
file
=
of
)
def make_clustering_visualisation_df(CLUSTER, df=None, add_synonyms=False,
                                     output_name='feature_clusters_output.csv'):
    """Tabulate a 2D clustering result for plotting and write it to disk.

    Parameters
    ----------
    CLUSTER : object
        Must expose ``components_`` (entries 0 and 1 are used as the X and Y
        coordinates), ``labels_`` (one cluster label per point) and
        ``centroids_`` (the length of its first entry scales the colours).
    df : pandas.DataFrame, optional
        When given, its index names the rows of the result and fills the
        ``Type`` column. When omitted, positional integers are used instead.
    add_synonyms : bool
        When true (and ``df`` is given) index entries are translated through
        the ``ens2sym`` / ``ens2sym_2`` lookup tables where possible.
    output_name : str
        Path of the tab-separated file that is written.

    Returns
    -------
    pandas.DataFrame
        Columns ``X``, ``Y``, ``Type``, ``Cluster`` and ``Color``.
    """
    x_pc1 = CLUSTER.components_[0]
    y_pc2 = CLUSTER.components_[1]
    labels = list(CLUSTER.labels_)
    L_C = len(CLUSTER.centroids_[0])

    def label_colour(label):
        # Start from a fresh white triplet for every label. Reusing one
        # shared list would let earlier labels leak into later colours.
        rgb = [255, 255, 255]
        channel = label % 3
        rgb[channel] = int(np.floor(rgb[channel] - (label / float(L_C)) * 255))
        return '#%02x%02x%02x' % (rgb[0] % 256, rgb[1] % 256, rgb[2] % 256)

    # One colour per distinct cluster label, looked up by the label itself
    # so that every member of a cluster is drawn in the same colour.
    colour_of = {}
    for label in labels:
        if label not in colour_of:
            colour_of[label] = label_colour(label)

    if df is None:
        # No names available: fall back to the position of each point.
        synonyms = list(range(len(x_pc1)))
    elif add_synonyms:
        # NOTE(review): ens2sym and ens2sym_2 are not defined in this block;
        # they are presumably module-level lookup tables - confirm they
        # exist before enabling add_synonyms.
        synonyms = [
            ens2sym[name][0] if name in ens2sym
            else ens2sym_2[name] if name in ens2sym_2
            else name
            for name in df.index.values
        ]
    else:
        synonyms = df.index.values

    data = [
        [x, y, t, cl, colour_of[cl]]
        for x, y, t, cl in zip(x_pc1, y_pc2, synonyms, labels)
    ]
    clustering_df = pd.DataFrame(
        data, columns=['X', 'Y', 'Type', 'Cluster', 'Color'])
    if df is not None:
        clustering_df.index = df.index.values
    # The separator is passed by keyword: recent pandas releases no longer
    # accept it positionally.
    clustering_df.to_csv(output_name, sep='\t')
    return clustering_df
if
__name__
==
'__main__'
:
#
# TEST DEPENDS ON THE DIABETES DATA FROM BROAD INSTITUTE
filename
=
'./Diabetes_collapsed_symbols.gct'
df_
=
pd
.
read_csv
(
filename
,
'
\t
'
,
index_col
=
0
,
header
=
2
)
ddf
=
df_
.
loc
[:,[
col
for
col
in
df_
.
columns
if
'_'
in
col
]]
;
ddf
.
index
=
[
idx
.
split
(
'/'
)[
0
]
for
idx
in
ddf
.
index
]
run_clustering_and_write_gmt
(
ddf
,
clustering_algorithm
)
ddf
=
df_
.
loc
[:,[
col
for
col
in
df_
.
columns
if
'_'
in
col
]]
ddf
.
index
=
[
idx
.
split
(
'/'
)[
0
]
for
idx
in
ddf
.
index
]
run_clustering_and_write_gmt
(
ddf
,
clustering_algorithm
)
#
CLU
=
Cluster
()
CLU
.
approximate_density_clustering
(
ddf
)
CLU
.
write_gmt
()
...
...
src/impetuous/quantification.py
浏览文件 @
449a8759
...
...
@@ -30,13 +30,39 @@ def SubArraysOf(Array,Array_=None):
return
([
Array
]
+
SubArraysOf
(
Array
[
1
:],
Array_
))
def permuter(inputs, n):
    """Return every ordered selection of ``n`` items taken from ``inputs``.

    Example: permuter( inputs=['T2D','NGT','Female','Male'] , n=2 )

    Parameters
    ----------
    inputs : iterable
        Items to draw from.
    n : int
        Number of items in each selection.

    Returns
    -------
    list of tuple
        The selections in the order produced by ``itertools.permutations``.
    """
    # itertools.permutations already yields the tuples; no zip/unwrap needed.
    return list(itertools.permutations(inputs, n))
def grouper(inputs, n):
    """Split ``inputs`` into consecutive tuples of ``n`` items each.

    All ``n`` positions of every tuple are drawn from one shared iterator,
    so each tuple consumes the next ``n`` items. A trailing group with
    fewer than ``n`` items is dropped, because ``zip`` stops as soon as the
    iterator runs dry.

    Returns the lazy ``zip`` iterator over the groups.
    """
    shared = iter(inputs)
    return zip(*(shared for _ in range(n)))
from
statsmodels.stats.multitest
import
multipletests
def adjust_p(pvalue_list, method='fdr_bh', alpha=0.05,
             check_r_bh=False, is_sorted=False,
             returnsorted=False):
    """Adjust a collection of p-values for multiple hypothesis testing.

    Example input:
    pvalue_list = [0.00001,0.01,0.0002,0.00005,0.01,0.1,0.2,0.4,0.5,0.6,0.7,0.8,0.9,0.99,0.0114,0.15,0.23,0.20]

    Parameters
    ----------
    pvalue_list : sequence of float
        Raw p-values.
    method : str
        Correction method name handed on to ``multipletests``.
    alpha, is_sorted, returnsorted :
        Forwarded unchanged to ``multipletests``.
    check_r_bh : bool
        When true and ``method`` has an R counterpart (only 'fdr_bh' ->
        'BH'), the adjustment is computed by R's ``stats::p.adjust`` via
        rpy2 instead of by ``multipletests``.

    Returns
    -------
    A list built from the second element of the ``multipletests`` result,
    or, on the R path, whatever ``p_adjust`` returns.
    """
    known_methods = {
        'bonferroni', 'sidak', 'holm-sidak', 'holm',
        'simes-hochberg', 'hommel', 'fdr_bh', 'fdr_by',
        'fdr_tsbh', 'fdr_tsbky',
    }
    # An unlisted method only triggers a hint on stdout; the call below is
    # still attempted with the name as given.
    if method not in known_methods:
        print(known_methods)

    r_names = {'fdr_bh': 'BH'}
    if not (check_r_bh and method in r_names):
        corrected = multipletests(pvalue_list, alpha=alpha, method=method,
                                  is_sorted=is_sorted,
                                  returnsorted=returnsorted)
        return list(corrected[1])

    # rpy2 is imported lazily so it is only required on the R path.
    from rpy2.robjects.packages import importr
    from rpy2.robjects.vectors import FloatVector
    r_stats = importr('stats')
    return r_stats.p_adjust(FloatVector(pvalue_list), method=r_names[method])
def
qvalues
(
p_values_in
,
pi0
=
None
)
:
p_s
=
p_values_in
if
pi0
is
None
:
...
...
@@ -159,7 +185,7 @@ def group_significance( subset , all_analytes_df = None ,
AllAnalytes
=
None
,
SigAnalytes
=
None
,
alternative
=
'greater'
)
:
# FISHER ODDS RATIO CHECK
# CHECK FOR ALTERNATIVE:
# CHECK FOR ALTERNATIVE
:
# 'greater' ( ENRICHMENT IN GROUP )
# 'two-sided' ( DIFFERENTIAL GROUP EXPRESSION )
# 'less' ( DEPLETION IN GROUP )
...
...
@@ -177,6 +203,7 @@ def group_significance( subset , all_analytes_df = None ,
oddsratio
,
pval
=
stats
.
fisher_exact
([[
AB
,
nAB
],
[
AnB
,
nAnB
]],
alternative
=
alternative
)
return
(
pval
,
oddsratio
)
def
quantify_groups_by_analyte_pvalues
(
analyte_df
,
grouping_file
,
delimiter
=
'
\t
'
,
tolerance
=
0.05
,
p_label
=
'C(Status),p'
,
group_prefix
=
''
)
:
...
...
@@ -374,8 +401,8 @@ if __name__ == '__main__' :
test_type
=
'random'
path_
=
'./'
analyte_file
=
path_
+
'fine.txt'
journal_file
=
path_
+
'coarse.txt'
analyte_file
=
path_
+
'fine.txt'
journal_file
=
path_
+
'coarse.txt'
grouping_file
=
path_
+
'groups.gmt'
analyte_df
=
pd
.
read_csv
(
analyte_file
,
'
\t
'
,
index_col
=
0
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录