Commit 449a8759 authored by rictjo

p.adj wrap

Parent 6641eb27
@@ -9,11 +9,14 @@ The journal and analyte expression file must be ordered
the same way with respect to the samples that are
positioned on the columns.
Visit the active code via:
Visit the active code via :
https://github.com/richardtjornhammar/impetuous
Visit the published code:
Visit the published code :
https://doi.org/10.5281/zenodo.2594691
Cite using
Cite using :
DOI: 10.5281/zenodo.2594691
Install with :
pip install impetuous-gfa
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
setuptools.setup(
name = "impetuous-gfa",
version = "0.2.30",
version = "0.2.32",
author = "Richard Tjörnhammar",
author_email = "richard.tjornhammar@gmail.com",
description = "Impetuous Quantification, Enrichment and Group Factor Analysis",
@@ -21,6 +21,7 @@ class Cluster(object):
self.labels_ = None
self.df_ = None
self.num_index_ = None
self.components_ = None
def approximate_density_clustering( self, df, nbins=None ) :
#
@@ -31,6 +32,7 @@ class Cluster(object):
self.df_= df
frac_df = df.apply( lambda x:self.rankdata( x , method='average' )/float(len(x)) )
self.pca_f.fit(frac_df.T.values)
self.components_ = self.pca_f.components_
vals,xe,ye = self.histogram2d(self.pca_f.components_[0],self.pca_f.components_[1],bins=nbins)
mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1)
svs = np.sqrt(svsx**2+svsy**2)
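For orientation, the fractional-rank transform that feeds the PCA above can be reproduced in isolation. A minimal sketch on toy data, assuming pca_f is a two-component sklearn PCA (its construction is outside this hunk):

import numpy as np
import pandas as pd
from scipy.stats import rankdata
from sklearn.decomposition import PCA

# toy expression matrix: rows are analytes, columns are samples
df = pd.DataFrame( np.random.rand( 100 , 8 ) )
# average ranks per column, rescaled to fractions in (0,1]
frac_df = df.apply( lambda x : rankdata( x , method='average' ) / float( len(x) ) )
# fit on the transpose so analytes become features
pca = PCA( n_components=2 )
pca.fit( frac_df.T.values )
components = pca.components_  # shape (2, n_analytes): a 2D coordinate per analyte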
@@ -57,38 +59,43 @@ class Cluster(object):
for k,v in self.analyte_dict_.items() :
print ( 'CLU-'+str(k),'\tDESCRIPTION\t'+'\t'.join(v), file=of )
class ManifoldClustering ( Cluster ):
class ManifoldClustering ( Cluster ) :
def __init__( self , nbins=50 ) :
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.manifold import MDS, TSNE
from numpy import histogram2d
from scipy.stats import rankdata
self.nbins = nbins
self.histogram2d = histogram2d
self.KMeans = KMeans
self.rankdata = rankdata
self.mds = MDS( n_components=2 )
self.mds = MDS ( n_components=2 )
self.tsne = TSNE ( n_components=2 )
self.man = None
self.centroids_ = None
self.labels_ = None
self.df_ = None
self.num_index_ = None
self.components_ =None
self.components_ = None
def approximate_embedding( self, df, nbins=None ) :
print ( 'WARNING::SLOW AND WASTEFUL' )
def approximate_embedding( self, df, nbins=None , use_tsne=True ) :
self.man = self.tsne
if not use_tsne :
self.man = self.mds
print ( 'WARNING::SLOW AND WASTEFUL' )
if nbins is None :
nbins = self.nbins
self.df_= df
frac_df = df.apply( lambda x:self.rankdata( x , method='average' )/float(len(x)) )
self.components_= np.array(self.mds.fit_transform(frac_df.values))
self.components_ = np.array(self.man.fit_transform(frac_df.values)).T
vals,xe,ye = self.histogram2d(self.components_[0],self.components_[1],bins=nbins)
mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1)
svs = np.sqrt(svsx**2+svsy**2)
svs = np.sqrt( svsx**2 + svsy**2 )
#
# IS THERE A DENSITY PEAK SEPARABLE FROM THE MEAN
# SHOULD DO GRADIENT REJECTION BASED ON TTEST PVALUES
hits = vals>mvs+0.5*svs
#print(hits,vals)
xe_,ye_ = 0.5*(xe[:-1]+xe[1:]) , 0.5*(ye[:-1]+ye[1:]) # midpoints of the histogram bin edges
idx = np.where(hits); xi,yj = idx[0],idx[1]
centroids = [ (xe[ri],ye[rj]) for (ri,rj) in zip(xi,yj) ]
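The density-peak heuristic shared by approximate_density_clustering and approximate_embedding can also be read as a standalone step. A sketch of just that step, keeping the 0.5 factor and the row/column standard-deviation combination from the code above:

import numpy as np

def density_peak_centroids ( x , y , nbins=50 ) :
    # occupancy histogram over the 2D embedding coordinates
    vals , xe , ye = np.histogram2d( x , y , bins=nbins )
    mvs = np.mean( vals )
    # combine per-column and per-row standard deviations of the occupancies
    svs = np.sqrt( np.std( vals , 0 )**2 + np.std( vals , 1 )**2 )
    # keep bins that rise half a combined standard deviation above the mean
    xi , yj = np.where( vals > mvs + 0.5*svs )
    # the corresponding bin edges serve as approximate centroids
    return ( [ ( xe[ri] , ye[rj] ) for ri , rj in zip( xi , yj ) ] )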
@@ -111,14 +118,49 @@ def run_clustering_and_write_gmt( df , ca , filename = './approx_cluster_file.gm
analytes = df.iloc[llabs==ulab].index.values
print ( 'CLU-'+str(ulab),'\tDESCRIPTION\t'+'\t'.join(analytes), file=of )
def make_clustering_visualisation_df ( CLUSTER , df=None , add_synonyms = False ,
output_name = 'feature_clusters_output.csv'
) :
x_pc1 = CLUSTER.components_[0]
y_pc2 = CLUSTER.components_[1]
L_C = len(CLUSTER.centroids_[0])
#
# MAKE CLUSTER COLORS
make_hex_colors = lambda c : '#%02x%02x%02x' % (c[0]%256,c[1]%256,c[2]%256)
C0 = [255,255,255] ; cluster_colors = []
#
for i in CLUSTER.labels_ :
C0_ = C0.copy() ; C0_[i%3] = int(np.floor(C0[i%3]-(i/float(L_C))*255)) # copy so the base colour is not mutated
cluster_colors.append(make_hex_colors(C0_))
if not df is None :
if add_synonyms :
# ens2sym / ens2sym_2 map Ensembl identifiers to gene symbols
synonyms = [ ens2sym[df.index.values[i]][0] if df.index.values[i] in ens2sym \
else ens2sym_2[df.index.values[i]] if df.index.values[i] in ens2sym_2 \
else df.index.values[i] for i in range(len(df.index.values)) ]
else :
synonyms = df.index.values
#
data = []
for (x,y,t,cl,co) in zip( x_pc1,y_pc2,synonyms , [cl for cl in CLUSTER.labels_] ,
[cluster_colors[cl] for cl in CLUSTER.labels_] ) :
data.append([x,y,t,cl,co])
clustering_df = pd.DataFrame( data , columns = ['X','Y','Type','Cluster','Color'])
if not df is None :
clustering_df.index = df.index.values
clustering_df.to_csv( output_name , sep='\t' )
return ( clustering_df )
if __name__ == '__main__' :
#
# TEST DEPENDS ON THE DIABETES DATA FROM BROAD INSTITUTE
filename = './Diabetes_collapsed_symbols.gct'
df_ = pd.read_csv( filename , sep='\t' , index_col=0 , header=2 )
ddf = df_.loc[:,[ col for col in df_.columns if '_' in col ]] ; ddf.index = [idx.split('/')[0] for idx in ddf.index]
run_clustering_and_write_gmt(ddf,clustering_algorithm)
ddf = df_.loc[:,[ col for col in df_.columns if '_' in col ]]
ddf .index = [idx.split('/')[0] for idx in ddf.index]
run_clustering_and_write_gmt( ddf , clustering_algorithm ) # clustering_algorithm must be defined by the caller
#
CLU = Cluster()
CLU.approximate_density_clustering(ddf)
CLU.write_gmt()
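A plausible end-to-end use of the pieces added in this file, assuming approximate_embedding populates labels_ and centroids_ the same way approximate_density_clustering does (ddf as in the test block above):

CLU = ManifoldClustering ( nbins=50 )
CLU.approximate_embedding ( ddf , use_tsne=True )  # use_tsne=False falls back to MDS
viz_df = make_clustering_visualisation_df ( CLU , df=ddf ,
             output_name = 'feature_clusters_output.csv' )
print ( viz_df.head() )  # columns: X, Y, Type, Cluster, Color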
@@ -30,13 +30,39 @@ def SubArraysOf(Array,Array_=None):
return([Array]+SubArraysOf(Array[1:],Array_))
def permuter( inputs , n ) :
# permuter( inputs=['T2D','NGT','Female','Male'] , n=2 )
# permuter( inputs=['T2D','NGT','Female','Male'] , n=2 )
return ( list( itertools.permutations( inputs , n ) ) )
def grouper ( inputs, n ):
iters = [iter(inputs)] * n
return zip(*iters)
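Worked examples for the two helpers above, with the inputs from the in-code comment:

inputs = [ 'T2D' , 'NGT' , 'Female' , 'Male' ]
print ( permuter( inputs , 2 ) )         # all 12 ordered pairs: ('T2D','NGT'), ('NGT','T2D'), ...
print ( list( grouper( inputs , 2 ) ) )  # consecutive pairs: [('T2D','NGT'), ('Female','Male')]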
from statsmodels.stats.multitest import multipletests
def adjust_p ( pvalue_list , method = 'fdr_bh' , alpha = 0.05,
check_r_bh = False , is_sorted = False ,
returnsorted = False
) :
""" WRAPPER FOR MULTIPLE HYPOTHESIS TESTING
pvalue_list = [0.00001,0.01,0.0002,0.00005,0.01,0.1,0.2,0.4,0.5,0.6,0.7,0.8,0.9,0.99,0.0114,0.15,0.23,0.20]
"""
available_methods = set( [ 'bonferroni' , 'sidak',
'holm-sidak' , 'holm' , 'simes-hochberg' ,
'hommel' , 'fdr_bh' , 'fdr_by' , 'fdr_tsbh' ,
'fdr_tsbky' ] )
if method not in available_methods :
print ( available_methods )
r_equiv = { 'fdr_bh':'BH' }
if check_r_bh and method in r_equiv :
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
r_stats = importr('stats')
p_adjust = r_stats.p_adjust ( FloatVector(pvalue_list), method = r_equiv[method] )
else :
p_adjust_results = multipletests ( pvalue_list, alpha=alpha, method=method,
is_sorted = is_sorted , returnsorted = returnsorted )
p_adjust = [ p_adj for p_adj in p_adjust_results[1] ]
return ( p_adjust )
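Since the p.adj wrapper is the point of this commit, a quick usage sketch with the docstring's example list (no output values asserted here):

pvalue_list = [ 0.00001 , 0.01 , 0.0002 , 0.00005 , 0.01 , 0.1 , 0.2 , 0.4 , 0.5 ,
                0.6 , 0.7 , 0.8 , 0.9 , 0.99 , 0.0114 , 0.15 , 0.23 , 0.20 ]
q_bh   = adjust_p ( pvalue_list )                       # Benjamini-Hochberg via statsmodels
q_bonf = adjust_p ( pvalue_list , method='bonferroni' )
# with rpy2 and R available, fdr_bh can be cross-checked against R's p.adjust
# q_r = adjust_p ( pvalue_list , check_r_bh=True )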
def qvalues ( p_values_in , pi0=None ) :
p_s = p_values_in
if pi0 is None :
@@ -159,7 +185,7 @@ def group_significance( subset , all_analytes_df = None ,
AllAnalytes = None , SigAnalytes = None,
alternative = 'greater' ) :
# FISHER ODDS RATIO CHECK
# CHECK FOR ALTERNATIVE:
# CHECK FOR ALTERNATIVE :
# 'greater' ( ENRICHMENT IN GROUP )
# 'two-sided' ( DIFFERENTIAL GROUP EXPRESSION )
# 'less' ( DEPLETION IN GROUP )
@@ -177,6 +203,7 @@
oddsratio , pval = stats.fisher_exact([[AB, nAB], [AnB, nAnB]], alternative=alternative)
return ( pval , oddsratio )
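The 2x2 table wiring can be checked with made-up counts; in this sketch A marks a significant analyte and B membership in the group (my reading of the variable names):

from scipy import stats

AB   = 12    # significant and in the group
nAB  =  8    # in the group, not significant
AnB  = 30    # significant, outside the group
nAnB = 950   # neither
oddsratio , pval = stats.fisher_exact ( [ [ AB , nAB ] , [ AnB , nAnB ] ] ,
                                        alternative = 'greater' )  # enrichment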
def quantify_groups_by_analyte_pvalues( analyte_df, grouping_file, delimiter='\t',
tolerance = 0.05 , p_label = 'C(Status),p' ,
group_prefix = '' ) :
@@ -374,8 +401,8 @@ if __name__ == '__main__' :
test_type = 'random'
path_ = './'
analyte_file = path_ + 'fine.txt'
journal_file = path_ + 'coarse.txt'
analyte_file = path_ + 'fine.txt'
journal_file = path_ + 'coarse.txt'
grouping_file = path_ + 'groups.gmt'
analyte_df = pd.read_csv( analyte_file , sep='\t' , index_col=0 )