Commit 449a8759 authored by rictjo

p.adj wrap

Parent 6641eb27
@@ -9,11 +9,14 @@ The journal and analyte expression file must be ordered
the same way with respect to the samples that are
positioned on the columns.
Visit the active code via:
Visit the active code via :
https://github.com/richardtjornhammar/impetuous
Visit the published code:
Visit the published code :
https://doi.org/10.5281/zenodo.2594691
Cite using
Cite using :
DOI: 10.5281/zenodo.2594691
Install with :
pip install impetuous-gfa
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
setuptools.setup(
name = "impetuous-gfa",
version = "0.2.30",
version = "0.2.32",
author = "Richard Tjörnhammar",
author_email = "richard.tjornhammar@gmail.com",
description = "Impetuous Quantification, Enrichment and Group Factor Analysis",
@@ -21,6 +21,7 @@ class Cluster(object):
self.labels_ = None
self.df_ = None
self.num_index_ = None
self.components_ = None
def approximate_density_clustering( self, df, nbins=None ) :
#
@@ -31,6 +32,7 @@ class Cluster(object):
self.df_= df
frac_df = df.apply( lambda x:self.rankdata( x , method='average' )/float(len(x)) )
self.pca_f.fit(frac_df.T.values)
self.components_ = self.pca_f.components_
vals,xe,ye = self.histogram2d(self.pca_f.components_[0],self.pca_f.components_[1],bins=nbins)
mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1)
svs = np.sqrt(svsx**2+svsy**2)
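For orientation, the fractional-rank transform that feeds the PCA above can be reproduced in isolation. A minimal sketch on toy data, assuming pca_f is a two-component sklearn PCA (its construction is outside this hunk):

import numpy as np
import pandas as pd
from scipy.stats import rankdata
from sklearn.decomposition import PCA

# toy expression matrix: rows are analytes, columns are samples
df = pd.DataFrame( np.random.rand( 100 , 8 ) )
# average ranks per column, rescaled to fractions in (0,1]
frac_df = df.apply( lambda x : rankdata( x , method='average' ) / float( len(x) ) )
# fit on the transpose so analytes become features
pca = PCA( n_components=2 )
pca.fit( frac_df.T.values )
components = pca.components_  # shape (2, n_analytes): a 2D coordinate per analyte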
@@ -57,38 +59,43 @@ class Cluster(object):
for k,v in self.analyte_dict_.items() :
print ( 'CLU-'+str(k),'\tDESCRIPTION\t'+'\t'.join(v), file=of )
class ManifoldClustering ( Cluster ):
class ManifoldClustering ( Cluster ) :
def __init__( self , nbins=50 ) :
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.manifold import MDS, TSNE
from numpy import histogram2d
from scipy.stats import rankdata
self.nbins = nbins
self.histogram2d = histogram2d
self.KMeans = KMeans
self.rankdata = rankdata
self.mds = MDS( n_components=2 )
self.mds = MDS ( n_components=2 )
self.tsne = TSNE ( n_components=2 )
self.man = None
self.centroids_ = None
self.labels_ = None
self.df_ = None
self.num_index_ = None
self.components_ =None
self.components_ = None
def approximate_embedding( self, df, nbins=None ) :
print ( 'WARNING::SLOW AND WASTEFUL' )
def approximate_embedding( self, df, nbins=None , use_tsne=True ) :
self.man = self.tsne
if not use_tsne :
self.man = self.mds
print ( 'WARNING::SLOW AND WASTEFUL' )
if nbins is None :
nbins = self.nbins
self.df_= df
frac_df = df.apply( lambda x:self.rankdata( x , method='average' )/float(len(x)) )
self.components_= np.array(self.mds.fit_transform(frac_df.values))
self.components_ = np.array(self.man.fit_transform(frac_df.values)).T
vals,xe,ye = self.histogram2d(self.components_[0],self.components_[1],bins=nbins)
mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1)
svs = np.sqrt(svsx**2+svsy**2)
svs = np.sqrt( svsx**2 + svsy**2 )
#
# IS THERE A DENSITY PEAK SEPARABLE FROM THE MEAN
# SHOULD DO GRADIENT REJECTION BASED ON TTEST PVALUES
hits = vals>mvs+0.5*svs
#print(hits,vals)
xe_,ye_ = 0.5*(xe[:-1]+xe[1:]) , 0.5*(ye[:-1]+ye[1:]) # midpoints of the histogram bin edges
idx = np.where(hits); xi,yj = idx[0],idx[1]
centroids = [ (xe[ri],ye[rj]) for (ri,rj) in zip(xi,yj) ]
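The density-peak heuristic shared by approximate_density_clustering and approximate_embedding can also be read as a standalone step. A sketch of just that step, keeping the 0.5 factor and the row/column standard-deviation combination from the code above:

import numpy as np

def density_peak_centroids ( x , y , nbins=50 ) :
    # occupancy histogram over the 2D embedding coordinates
    vals , xe , ye = np.histogram2d( x , y , bins=nbins )
    mvs = np.mean( vals )
    # combine per-column and per-row standard deviations of the occupancies
    svs = np.sqrt( np.std( vals , 0 )**2 + np.std( vals , 1 )**2 )
    # keep bins that rise half a combined standard deviation above the mean
    xi , yj = np.where( vals > mvs + 0.5*svs )
    # the corresponding bin edges serve as approximate centroids
    return ( [ ( xe[ri] , ye[rj] ) for ri , rj in zip( xi , yj ) ] )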
@@ -111,14 +118,49 @@ def run_clustering_and_write_gmt( df , ca , filename = './approx_cluster_file.gm
analytes = df.iloc[llabs==ulab].index.values
print ( 'CLU-'+str(ulab),'\tDESCRIPTION\t'+'\t'.join(analytes), file=of )
def make_clustering_visualisation_df ( CLUSTER , df=None , add_synonyms = False ,
output_name = 'feature_clusters_output.csv'
) :
x_pc1 = CLUSTER.components_[0]
y_pc2 = CLUSTER.components_[1]
L_C = len(CLUSTER.centroids_[0])
#
# MAKE CLUSTER COLORS
make_hex_colors = lambda c : '#%02x%02x%02x' % (c[0]%256,c[1]%256,c[2]%256)
C0 = [255,255,255] ; cluster_colors = []
#
for i in CLUSTER.labels_ :
C0_ = C0.copy() ; C0_[i%3] = int(np.floor(C0[i%3]-(i/float(L_C))*255)) # copy so the base colour is not mutated
cluster_colors.append(make_hex_colors(C0_))
if not df is None :
if add_synonyms :
# ens2sym / ens2sym_2 map Ensembl identifiers to gene symbols
synonyms = [ ens2sym[df.index.values[i]][0] if df.index.values[i] in ens2sym \
else ens2sym_2[df.index.values[i]] if df.index.values[i] in ens2sym_2 \
else df.index.values[i] for i in range(len(df.index.values)) ]
else :
synonyms = df.index.values
#
data = []
for (x,y,t,cl,co) in zip( x_pc1,y_pc2,synonyms , [cl for cl in CLUSTER.labels_] ,
[cluster_colors[cl] for cl in CLUSTER.labels_] ) :
data.append([x,y,t,cl,co])
clustering_df = pd.DataFrame( data , columns = ['X','Y','Type','Cluster','Color'])
if not df is None :
clustering_df.index = df.index.values
clustering_df.to_csv( output_name , sep='\t' )
return ( clustering_df )
if __name__ == '__main__' :
#
# TEST DEPENDS ON THE DIABETES DATA FROM BROAD INSTITUTE
filename = './Diabetes_collapsed_symbols.gct'
df_ = pd.read_csv( filename , sep='\t' , index_col=0 , header=2 )
ddf = df_.loc[:,[ col for col in df_.columns if '_' in col ]] ; ddf.index = [idx.split('/')[0] for idx in ddf.index]
run_clustering_and_write_gmt(ddf,clustering_algorithm)
ddf = df_.loc[:,[ col for col in df_.columns if '_' in col ]]
ddf .index = [idx.split('/')[0] for idx in ddf.index]
run_clustering_and_write_gmt( ddf , clustering_algorithm ) # clustering_algorithm must be defined by the caller
#
CLU = Cluster()
CLU.approximate_density_clustering(ddf)
CLU.write_gmt()
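A plausible end-to-end use of the pieces added in this file, assuming approximate_embedding populates labels_ and centroids_ the same way approximate_density_clustering does (ddf as in the test block above):

CLU = ManifoldClustering ( nbins=50 )
CLU.approximate_embedding ( ddf , use_tsne=True )  # use_tsne=False falls back to MDS
viz_df = make_clustering_visualisation_df ( CLU , df=ddf ,
             output_name = 'feature_clusters_output.csv' )
print ( viz_df.head() )  # columns: X, Y, Type, Cluster, Color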
@@ -30,13 +30,39 @@ def SubArraysOf(Array,Array_=None):
return([Array]+SubArraysOf(Array[1:],Array_))
def permuter( inputs , n ) :
# permuter( inputs=['T2D','NGT','Female','Male'] , n=2 )
# permuter( inputs=['T2D','NGT','Female','Male'] , n=2 )
return ( list( itertools.permutations( inputs , n ) ) )
def grouper ( inputs, n ):
iters = [iter(inputs)] * n
return zip(*iters)
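Worked examples for the two helpers above, with the inputs from the in-code comment:

inputs = [ 'T2D' , 'NGT' , 'Female' , 'Male' ]
print ( permuter( inputs , 2 ) )         # all 12 ordered pairs: ('T2D','NGT'), ('NGT','T2D'), ...
print ( list( grouper( inputs , 2 ) ) )  # consecutive pairs: [('T2D','NGT'), ('Female','Male')]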
from statsmodels.stats.multitest import multipletests
def adjust_p ( pvalue_list , method = 'fdr_bh' , alpha = 0.05,
check_r_bh = False , is_sorted = False ,
returnsorted = False
) :
""" WRAPPER FOR MULTIPLE HYPOTHESIS TESTING
pvalue_list = [0.00001,0.01,0.0002,0.00005,0.01,0.1,0.2,0.4,0.5,0.6,0.7,0.8,0.9,0.99,0.0114,0.15,0.23,0.20]
"""
available_methods = set( [ 'bonferroni' , 'sidak',
'holm-sidak' , 'holm' , 'simes-hochberg' ,
'hommel' , 'fdr_bh' , 'fdr_by' , 'fdr_tsbh' ,
'fdr_tsbky' ] )
if method not in available_methods :
print ( available_methods )
r_equiv = { 'fdr_bh':'BH' }
if check_r_bh and method in r_equiv :
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
r_stats = importr('stats')
p_adjust = r_stats.p_adjust ( FloatVector(pvalue_list), method = r_equiv[method] )
else :
p_adjust_results = multipletests ( pvalue_list, alpha=alpha, method=method,
is_sorted = is_sorted , returnsorted = returnsorted )
p_adjust = [ p_adj for p_adj in p_adjust_results[1] ]
return ( p_adjust )
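Since the p.adj wrapper is the point of this commit, a quick usage sketch with the docstring's example list (no output values asserted here):

pvalue_list = [ 0.00001 , 0.01 , 0.0002 , 0.00005 , 0.01 , 0.1 , 0.2 , 0.4 , 0.5 ,
                0.6 , 0.7 , 0.8 , 0.9 , 0.99 , 0.0114 , 0.15 , 0.23 , 0.20 ]
q_bh   = adjust_p ( pvalue_list )                       # Benjamini-Hochberg via statsmodels
q_bonf = adjust_p ( pvalue_list , method='bonferroni' )
# with rpy2 and R available, fdr_bh can be cross-checked against R's p.adjust
# q_r = adjust_p ( pvalue_list , check_r_bh=True )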
def qvalues ( p_values_in , pi0=None ) :
p_s = p_values_in
if pi0 is None :
@@ -159,7 +185,7 @@ def group_significance( subset , all_analytes_df = None ,
AllAnalytes = None , SigAnalytes = None,
alternative = 'greater' ) :
# FISHER ODDS RATIO CHECK
# CHECK FOR ALTERNATIVE:
# CHECK FOR ALTERNATIVE :
# 'greater' ( ENRICHMENT IN GROUP )
# 'two-sided' ( DIFFERENTIAL GROUP EXPRESSION )
# 'less' ( DEPLETION IN GROUP )
@@ -177,6 +203,7 @@
oddsratio , pval = stats.fisher_exact([[AB, nAB], [AnB, nAnB]], alternative=alternative)
return ( pval , oddsratio )
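The 2x2 table wiring can be checked with made-up counts; in this sketch A marks a significant analyte and B membership in the group (my reading of the variable names):

from scipy import stats

AB   = 12    # significant and in the group
nAB  =  8    # in the group, not significant
AnB  = 30    # significant, outside the group
nAnB = 950   # neither
oddsratio , pval = stats.fisher_exact ( [ [ AB , nAB ] , [ AnB , nAnB ] ] ,
                                        alternative = 'greater' )  # enrichment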
def quantify_groups_by_analyte_pvalues( analyte_df, grouping_file, delimiter='\t',
tolerance = 0.05 , p_label = 'C(Status),p' ,
group_prefix = '' ) :
@@ -374,8 +401,8 @@ if __name__ == '__main__' :
test_type = 'random'
path_ = './'
analyte_file = path_ + 'fine.txt'
journal_file = path_ + 'coarse.txt'
analyte_file = path_ + 'fine.txt'
journal_file = path_ + 'coarse.txt'
grouping_file = path_ + 'groups.gmt'
analyte_df = pd.read_csv( analyte_file , sep='\t' , index_col=0 )