Commit 3e6127a3 authored by rictjo

cleaning

Parent ae1c00c5
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
import sys
import sklearn.cluster as sc
try :
from numba import jit
bUseNumba = True
except ImportError :
bUseNumba = False
# THE FOLLOWING KMEANS ALGORITHM IS THE AUTHOR'S OWN LOCAL VERSION
if bUseNumba :
@jit(nopython=True)
def seeded_kmeans( dat, cent ):
#
# PYTHON ADAPTATION OF MY C++ CODE THAT CAN BE FOUND IN
# https://github.com/richardtjornhammar/RichTools/blob/master/src/cluster.cc
# AROUND LINE 2345
# AGAIN CONSIDER USING THE C++ VERSION SINCE IT IS A LOT FASTER
# HERE WE SPEED IT UP USING NUMBA IF THE USER HAS IT INSTALLED AS A MODULE
#
NN , MM = np.shape ( dat )
KK , LL = np.shape ( cent )
if not LL == MM :
print ( 'WARNING: DATA FORMAT ERROR. NON-MATCHING COORDINATE DIMENSIONS' )
labels = [ int(z) for z in np.zeros(NN) ]
w = labels
counts = np.zeros(KK)
tmp_ce = np.zeros(KK*MM).reshape(KK,MM)
old_error , error , TOL = 0. , 1. , 1.0E-10
while abs ( error - old_error ) > TOL :
old_error = error
error = 0.
counts = counts * 0.
tmp_ce = tmp_ce * 0.
# START BC
for h in range ( NN ) :
min_distance = 1.0E30
for i in range ( KK ) :
distance = np.sum( ( dat[h]-cent[i] )**2 )
if distance < min_distance :
labels[h] = i
min_distance = distance
tmp_ce[labels[h]] += dat[ h ]
counts[labels[h]] += 1.0
error += min_distance
# END BC
for i in range ( KK ) :
if counts[i]>0:
cent[i] = tmp_ce[i]/counts[i]
centroids = cent
return ( labels, centroids )
else :
def seeded_kmeans( dat, cent ):
#
# SLOW, SLUGGISH KMEANS WITH A DOUBLE FOR LOOP
# IN PYTHON! WOW! SUCH SPEED!
#
NN , MM = np.shape ( dat )
KK , LL = np.shape ( cent )
if not LL == MM :
print ( 'WARNING: DATA FORMAT ERROR. NON-MATCHING COORDINATE DIMENSIONS' )
labels = [ int(z) for z in np.zeros(NN) ]
w = labels
counts = np.zeros(KK)
tmp_ce = np.zeros(KK*MM).reshape(KK,MM)
old_error , error , TOL = 0. , 1. , 1.0E-10
while abs ( error - old_error ) > TOL :
old_error = error
error = 0.
counts = counts * 0.
tmp_ce = tmp_ce * 0.
# START BC
for h in range ( NN ) :
min_distance = 1.0E30
for i in range ( KK ) :
distance = np.sum( ( dat[h]-cent[i] )**2 )
if distance < min_distance :
labels[h] = i
min_distance = distance
tmp_ce[labels[h]] += dat[ h ]
counts[labels[h]] += 1.0
error += min_distance
# END BC
for i in range ( KK ) :
if counts[i]>0:
cent[i] = tmp_ce[i]/counts[i]
centroids = cent
return ( labels, centroids )
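#
# USAGE SKETCH ( HYPOTHETICAL DEMO DATA , NOT PART OF THE ORIGINAL CODE ) :
# dat = np.random.rand( 100 , 3 ) # 100 POINTS IN 3 DIMENSIONS
# cent = dat[:4].copy() # 4 SEED CENTROIDS TAKEN FROM THE DATA
# labels , centroids = seeded_kmeans( dat , cent )
# labels[h] IS THE CLUSTER INDEX OF POINT h AND centroids HOLDS THE CONVERGED CENTERS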
def connectivity ( B , val, bVerbose=True ) :
description="""
This is a cutoff based clustering algorithm. The intended use is to supply a distance matrix and a cutoff value (the matrix is then treated as a symmetric, non-negative connectivity criterion). For a small distance cutoff you should see all the separate parts of the system, and for a large distance cutoff you should see the entire system as one cluster. It has been employed for statistical analysis work as well as for the original application, where it was used to segment molecular systems.
"""
# PYTHON ADAPTATION OF MY C++ CODE THAT CAN BE FOUND IN
# https://github.com/richardtjornhammar/RichTools/blob/master/src/cluster.cc
# AROUND LINE 2277
# CONSIDER COMPILING AND USING THAT AS A MODULE INSTEAD OF THIS SINCE IT IS
# A LOT FASTER
# FOR A DESCRIPTION READ PAGE 30 (16 INTERNAL NUMBERING) of:
# https://kth.diva-portal.org/smash/get/diva2:748464/FULLTEXT01.pdf
#
nr_sq,mr_sq = np.shape(B)
if nr_sq != mr_sq :
print ( 'ERROR' )
exit (1)
N = mr_sq
res , nvisi, s, NN, ndx, C = [], [], [], [], [], 0
res .append(0)
for i in range(N) :
nvisi.append(i+1)
res.append(0); res.append(0)
ndx.append(i)
while ( len(ndx)>0 ) :
i = ndx[-1] ; ndx = ndx[:-1]
NN = []
if ( nvisi[i]>0 ) :
C-=1
for j in range(N) :
if ( B[i,j]<val ) :
NN.append(j)
while ( len(NN)>0 ) :
# back pop_back
k = NN[-1]; NN = NN[:-1]
nvisi[k] = C
for j in range(N):
if ( B[j,k]<val ) :
for q in range(N) :
if ( nvisi[q] == j+1 ) :
NN.append(q)
if bVerbose : # VERBOSE
print("INFO "+str(-1*C) +" clusters" )
Nc = [ 0 for i in range(-1*C) ]
for q in range(N) :
res[ q*2+1 ] = q;
res[ q*2 ] = nvisi[q]-C;
Nc [res[q*2]]+= 1;
if bVerbose :
print ( " "+str(res[q*2])+" "+str(res[2*q+1]) )
for i in range(-1*C) :
print( "CLUSTER " +str(i)+ " HAS " + str(Nc[i]) + " ELEMENTS")
return ( Nc , np.array(res[:-1]).reshape(-1,2) )
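#
# USAGE SKETCH ( HYPOTHETICAL DISTANCE MATRIX , NOT PART OF THE ORIGINAL CODE ) :
# ANY PAIR OF POINTS CLOSER THAN val IS MERGED INTO THE SAME CLUSTER
# B = np.array( [ [0.0,0.1,5.0] , [0.1,0.0,5.0] , [5.0,5.0,0.0] ] )
# Nc , pairs = connectivity( B , 1.0 ) # EXPECT TWO CLUSTERS
# Nc HOLDS THE CLUSTER SIZES ; pairs[:,0] IS THE CLUSTER INDEX AND pairs[:,1] THE POINT INDEX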
clustering_algorithm = None
clustering_algorithm = sc.KMeans(10) # CHOOSE SOMETHING YOU LIKE NOT THIS
class Cluster(object):
def __init__( self, nbins=50, nclusters=-1 , use_ranks = False ) :
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from numpy import histogram2d
from scipy.stats import rankdata
self.use_ranks = use_ranks
self.nclusters = nclusters
self.nbins = nbins
self.histogram2d = histogram2d
self.KMeans = KMeans
self.rankdata = rankdata
self.pca_f = PCA(2)
self.centroids_ = None
self.labels_ = None
self.df_ = None
self.num_index_ = None
self.components_ = None
def approximate_density_clustering( self, df, nbins=None ) :
#
# GENES APPROX 20K OK SO APPROX 50 BINS
# ANALYTES ON ROWS, SAMPLE POINTS ON COLUMNS
if nbins is None :
nbins = self.nbins
self.df_= df
frac_df = df
if self.use_ranks :
frac_df = frac_df.apply( lambda x:self.rankdata( x , method='average' )/float(len(x)) )
self.pca_f.fit(frac_df.T.values)
self.components_ = self.pca_f.components_
vals,xe,ye = self.histogram2d(self.pca_f.components_[0],self.pca_f.components_[1],bins=nbins)
mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1)
svs = np.sqrt(svsx**2+svsy**2)
#
# IS THERE A DENSITY PEAK SEPARABLE FROM THE MEAN
# SHOULD DO GRADIENT REJECTION BASED ON TTEST PVALUES
hits = vals>mvs+0.5*svs
#
xe_ , ye_ = 0.5*(xe[:-1]+xe[1:]) , 0.5*(ye[:-1]+ye[1:]) # BIN CENTERS
idx = np.where(hits); xi,yj = idx[0],idx[1]
centroids = [ (xe[ri],ye[rj]) for (ri,rj) in zip(xi,yj) ]
if self.nclusters == -1 :
self.nclusters = len ( centroids )
if self.nclusters < len ( centroids ) :
import heapq
from scipy.spatial import distance as distance_
a = distance_.cdist ( centroids, centroids, 'euclidean' )
cent_idx = heapq.nlargest ( self.nclusters, range(len(a)), a.reshape(-1).__getitem__ )
centroids = [ centroids[ idx ] for idx in cent_idx ]
kmeans = self.KMeans(len(centroids),init=np.array(centroids))
kmeans.fit(self.pca_f.components_.T)
centers = np.array(kmeans.cluster_centers_).T
self.labels_ = kmeans.labels_
self.centroids_ = centers
self.analyte_dict_ = { c:[] for c in self.labels_ }
[self.analyte_dict_[self.labels_[i]].append(df.index[i]) for i in range(len(self.labels_)) ]
return ( self.analyte_dict_ )
def write_gmt(self, filename = './cluster_file.gmt' ) :
with open(filename,'w') as of :
for k,v in self.analyte_dict_.items() :
print ( 'CLU-'+str(k),'\tDESCRIPTION\t'+'\t'.join(v), file=of )
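#
# USAGE SKETCH ( HYPOTHETICAL EXPRESSION FRAME , NOT PART OF THE ORIGINAL CODE ) :
# df HAS ANALYTES ON THE ROWS AND SAMPLES ON THE COLUMNS
# df = pd.DataFrame( np.random.rand( 200 , 20 ) )
# clu = Cluster( nbins = 20 )
# analyte_dict = clu.approximate_density_clustering( df ) # { CLUSTER LABEL : [ ANALYTE IDS ] }
# clu.write_gmt( './my_clusters.gmt' )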
class ManifoldClustering ( Cluster ) :
def __init__( self , nbins=50 ) :
from sklearn.cluster import KMeans
from sklearn.manifold import MDS, TSNE
from numpy import histogram2d
from scipy.stats import rankdata
self.nbins = nbins
self.histogram2d = histogram2d
self.KMeans = KMeans
self.rankdata = rankdata
self.mds = MDS ( n_components=2 )
self.tsne = TSNE ( n_components=2 )
self.man = None
self.centroids_ = None
self.labels_ = None
self.df_ = None
self.num_index_ = None
self.components_ = None
def approximate_embedding( self, df, nbins=None , use_tsne=True ) :
self.man = self.tsne
if not use_tsne :
self.man = self.mds
print ( 'WARNING::SLOW AND WASTEFUL' )
if nbins is None :
nbins = self.nbins
self.df_= df
frac_df = df.apply( lambda x:self.rankdata( x , method='average' )/float(len(x)) )
self.components_ = np.array(self.man.fit_transform(frac_df.values)).T
vals,xe,ye = self.histogram2d(self.components_[0],self.components_[1],bins=nbins)
mvs, svsx, svsy = np.mean(vals),np.std(vals,0),np.std(vals,1)
svs = np.sqrt( svsx**2 + svsy**2 )
#
# IS THERE A DENSITY PEAK SEPARABLE FROM THE MEAN
# SHOULD DO GRADIENT REJECTION BASED ON TTEST PVALUES
hits = vals>mvs+0.5*svs
#print(hits,vals)
xe_ , ye_ = 0.5*(xe[:-1]+xe[1:]) , 0.5*(ye[:-1]+ye[1:]) # BIN CENTERS
idx = np.where(hits); xi,yj = idx[0],idx[1]
centroids = [ (xe[ri],ye[rj]) for (ri,rj) in zip(xi,yj) ]
#
kmeans = self.KMeans(len(centroids),init=np.array(centroids))
kmeans.fit(self.components_.T)
centers = np.array(kmeans.cluster_centers_).T
self.labels_ = kmeans.labels_
self.centroids_ = centers
self.analyte_dict_ = { c:[] for c in self.labels_ }
[self.analyte_dict_[self.labels_[i]].append(df.index[i]) for i in range(len(self.labels_)) ]
return ( self.analyte_dict_ )
def run_clustering_and_write_gmt( df , ca , filename = './approx_cluster_file.gmt' ) :
labels = ca.fit_predict(df.values)
llabs = [ l for l in labels ]; ulabs=set(llabs)
with open(filename,'w') as of :
for ulab in ulabs :
analytes = df.iloc[ [ l==ulab for l in llabs ] ].index.values
print ( 'CLU-'+str(ulab),'\tDESCRIPTION\t'+'\t'.join(analytes), file=of )
def make_clustering_visualisation_df ( CLUSTER , df=None , add_synonyms = False ,
output_name = 'feature_clusters_output.csv'
) :
x_pc1 = CLUSTER.components_[0]
y_pc2 = CLUSTER.components_[1]
L_C = len(CLUSTER.centroids_[0])
#
# MAKE CLUSTER COLORS
make_hex_colors = lambda c : '#%02x%02x%02x' % (c[0]%256,c[1]%256,c[2]%256)
C0 = [255,255,255] ; cluster_colors = []
#
for i in CLUSTER.labels_ :
C0_ = [ c for c in C0 ] ; C0_[i%3] = int(np.floor(C0[i%3]-(i/float(L_C))*255))
cluster_colors.append(make_hex_colors(C0_))
if not df is None :
if add_synonyms :
# NOTE: THE ens2sym AND ens2sym_2 LOOKUP DICTIONARIES MUST EXIST IN THE CALLING SCOPE
synonyms = [ ens2sym[df.index.values[i]][0] if df.index.values[i] in ens2sym \
else ens2sym_2[df.index.values[i]] if df.index.values[i] in ens2sym_2 \
else df.index.values[i] for i in range(len(df.index.values))]
else :
synonyms = df.index.values
data = []
for (x,y,t,cl,co) in zip( x_pc1,y_pc2,synonyms , [cl for cl in CLUSTER.labels_] ,
[cluster_colors[cl] for cl in CLUSTER.labels_] ) :
data.append([x,y,t,cl,co])
clustering_df = pd.DataFrame( data , columns = ['X','Y','Type','Cluster','Color'])
if not df is None :
clustering_df.index = df.index.values
clustering_df.to_csv( output_name , '\t' )
return ( clustering_df )
from scipy.spatial.distance import squareform , pdist
abolsute_coordinates_to_distance_matrix = lambda Q:squareform(pdist(Q))
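#
# USAGE SKETCH ( HYPOTHETICAL COORDINATES , NOT PART OF THE ORIGINAL CODE ) :
# THE LAMBDA ABOVE TURNS ABSOLUTE COORDINATES INTO A SQUARE DISTANCE MATRIX
# WHICH CAN BE FED DIRECTLY INTO THE connectivity ROUTINE
# Q = np.random.rand( 10 , 3 )
# D = abolsute_coordinates_to_distance_matrix( Q )
# print ( connectivity( D , 0.25 ) )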
if __name__ == '__main__' :
if False :
#
# TEST DEPENDS ON THE DIABETES DATA FROM BROAD INSTITUTE
filename = './Diabetes_collapsed_symbols.gct'
df_ = pd.read_csv(filename,'\t',index_col=0,header=2)
ddf = df_.loc[:,[ col for col in df_.columns if '_' in col ]]
ddf .index = [idx.split('/')[0] for idx in ddf.index]
run_clustering_and_write_gmt( ddf , clustering_algorithm )
#
CLU = Cluster( )
CLU .approximate_density_clustering(ddf)
CLU .write_gmt()
if False :
A = np.array( [ [0.00, 0.10, 0.10, 9.00, 9.00, 9.00],
[0.10, 0.00, 0.15, 9.00, 9.00, 9.00],
[0.10, 0.15, 0.00, 9.00, 9.00, 9.00],
[9.00, 9.00, 9.00, 0.00, 0.10, 0.10],
[9.10, 9.00, 9.00, 0.10, 0.00, 0.15],
[9.10, 9.00, 9.00, 0.10, 0.15, 0.00] ] )
print( connectivity(A,0.01) )
"""
Copyright 2019 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
import networkx as nx
from networkx.readwrite import json_graph
import json
import sys
from scipy.stats import rankdata
def drop_duplicate_indices( df ):
df_ = df.loc[~df.index.duplicated(keep='first')]
return df_
def write_tree( tree , outfile='tree.json' ):
root = [ eid for eid,ancestor in tree.in_degree() if ancestor == 0 ][ 0 ]
o_json = json_graph.tree_data( tree , root )
if not outfile is None:
with open(outfile, 'w') as o_file:
json.dump(o_json, o_file )
return( o_json )
def add_attributes_to_tree ( p_df , tree ):
id_lookup = { rid:lid for (lid,rid) in nx.get_node_attributes(tree,'source').items() }
add_attributes = p_df.columns.values
for attribute in add_attributes:
propd = { id_lookup[idx]:{attribute:val} for (idx,val)
in zip(p_df.index.values,p_df.loc[:,attribute]) if idx in set(id_lookup.keys()) }
nx.set_node_attributes( tree , propd )
return( tree )
def parent_child_to_dag (
relationship_file = './PCLIST.txt' ,
i_p = 0 , i_c = 1
) :
n_df = pd .read_csv ( relationship_file , '\t' )
pair_tuples = [ (p,c) for (p,c) in zip(n_df.iloc[:,i_p],n_df.iloc[:,i_c]) ]
children_of = {} ; all_names = set([])
for ( p,c ) in pair_tuples :
if not 'list' in str(type(c)):
C = [c]
all_names = all_names | set([p]) | set(C)
if p in children_of :
children_of[p] .append(c)
else :
children_of[p] = [c]
G = nx .DiGraph()
G .add_nodes_from ( all_names )
G .add_edges_from ( pair_tuples )
tree = nx.algorithms.dag.dag_to_branching( G )
root = [ eid for eid,ancestor in tree.in_degree() if ancestor == 0 ][ 0 ]
descendants = [ ( idx , nx.algorithms.dag.descendants(G,idx) ) for idx in all_names ]
ancestors = [ ( idx , nx.algorithms.dag.ancestors(G,idx) ) for idx in all_names ]
return ( tree,ancestors,descendants )
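#
# USAGE SKETCH ( HYPOTHETICAL PCLIST FILE , NOT PART OF THE ORIGINAL CODE ) :
# THE RELATIONSHIP FILE IS A TAB SEPARATED PARENT CHILD LIST WITH A HEADER ROW , E.G. :
# parent<TAB>child
# R-HSA-1<TAB>R-HSA-2
# R-HSA-2<TAB>R-HSA-3
# tree , ancestors , descendants = parent_child_to_dag( './PCLIST.txt' )
# p_df = make_pathway_ancestor_data_frame( ancestors )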
def make_pathway_ancestor_data_frame(ancestors):
p_df = None
for k,v in ancestors :
t_df = pd.DataFrame([[','.join(list(v)),len(v)]],index=[k],columns=['DAG,ancestors','DAG,level'])
if p_df is None :
p_df = t_df
else :
p_df = pd.concat([p_df,t_df])
return( p_df )
def normalise_for_apples_and_oranges_stats( X , method='ordinal' ):
X_ = rankdata( X , method=method )/len(X)
return(X_)
def make_group_analytes_unique( grouping_file , delimiter='\t' ):
unique_grouping_file = '/'.join(grouping_file.split('/')[:-1]) + '/unique_'+grouping_file.split('/')[-1]
with open( unique_grouping_file , 'w' ) as of:
with open( grouping_file ) as input :
for line in input :
vline = line.replace('\n','').split(delimiter)
gid, gdesc, analytes_ = vline[0], vline[1], list(set(vline[2:]))
nvec = [gid,gdesc] ; [ nvec.append(a) for a in analytes_ ]
print ( delimiter.join(nvec) , file = of )
def read_conversions(file_name) :
gene2ens = {} ; non_unique = []
with open( file_name , 'r' ) as infile:
if sys.version_info[0] < 3:
infile.next()
else :
next(infile)
for line in infile:
words = line.strip().split('\t')
if len(words)==2 :
ens, gene = words
if gene in gene2ens:
gene2ens[gene].append(ens)
else :
gene2ens[gene] = [ens]
return gene2ens
def read_gene_ensemble_conversion(file_name):
gene2ens = {} ; non_unique = []
with open(file_name,'r') as infile:
if sys.version_info[0] < 3:
infile.next()
else:
next(infile)
for line in infile:
words = line.strip().split('\t')
if len(words)==2 :
ens, gene = words
if gene in gene2ens:
non_unique.append((gene,ens,gene2ens[gene]))
else :
gene2ens[gene] = ens
if len(non_unique)>0:
print(' WARNING ' )
print( 'FOUND ', len(non_unique), ' NON UNIQUE ENTRIES' )
return gene2ens
def create_synonyms( convert_file , unique_mapping=False ):
# CREATE SYNONYMS
ens2sym , sym2ens = {} , {}
if unique_mapping:
sym2ens = read_gene_ensemble_conversion( convert_file )
ens2sym = { v:k for k,v in sym2ens.items() }
else :
sym2ens_list = read_conversions( convert_file )
ens2sym_list = {}
for s,L in sym2ens_list.items() :
for e in L:
if e in ens2sym_list:
ens2sym_list[e].append(s)
else:
ens2sym_list[e] = [s]
ens2sym = ens2sym_list
sym2ens = sym2ens_list
return ( ens2sym , sym2ens )
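#
# USAGE SKETCH ( HYPOTHETICAL CONVERSION FILE , NOT PART OF THE ORIGINAL CODE ) :
# THE CONVERSION FILE IS TAB SEPARATED WITH A HEADER LINE FOLLOWED BY ROWS OF
# ensembl_id<TAB>gene_symbol
# ens2sym , sym2ens = create_synonyms( './conv.txt' )
# s2e = flatten_dict( sym2ens ) # FORCE A SINGLE ENSEMBL ID PER SYMBOL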
def flatten_dict( s2e ) :
# FORCES FIRST ELEMENT
ndict = {}
for (s,e) in s2e.items() :
if 'list' in str(type(e)) :
ndict[s] = e[0]
else :
ndict[s] = e
return ( ndict )
def convert_rdata_to_dataframe ( filename ) :
#
from rpy2.robjects import r as R
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import rpy2.robjects as ro
#
print ( 'WARNING: THIS PROGRAM NEEDS VALUE ERROR CHECKING' )
rd_ = R.load( filename )
if 'matrix' in str( type( R[rd_[0]] ) ).lower() :
column_names = [ R[rd_[0]].colnames ]
index_names = [ R[rd_[0]].rownames ]
else :
column_names = [ [r for r in _rd_.colnames] for _rd_ in R[rd_[0]]]
index_names = [ [r for r in _rd_.rownames] for _rd_ in R[rd_[0]]]
#
pandas2ri.activate()
#
# SMALL HELPER FUNCTION THAT TRANSFORMS A RDATA OBJECT INTO
# A PANDAS DATAFRAME. CURRENTLY THERE IS NO VALUE ERROR CHECKING
#
rd = R.load( filename )
raw_df_l = []
if 'ndarray' in str( type( R[rd[0]] ) ).lower() :
[ raw_df_l.append( R[rd[0]] ) ]
else :
[ raw_df_l.append( rdf ) for rdf in ro.vectors.DataFrame(R[rd[0]]) ]
full_df_dict = {} ; i_ = 0
for raw_df,colnames,rownames in zip( raw_df_l,column_names,index_names ) :
pdf = pd.DataFrame( raw_df , columns=colnames , index=rownames )
full_df_dict[i_] = pdf
i_ = i_ + 1
pandas2ri.deactivate()
return ( full_df_dict )
import os
if __name__ == '__main__' :
#
bMOFA_data = True
if bMOFA_data :
import os
# os.system('mkdir ../data')
# os.system('wget https://github.com/bioFAM/MOFAdata/blob/master/data/CLL_data.RData')
# os.system('mv CLL_data.RData ../data/.')
df_dict = convert_rdata_to_dataframe( filename = '../data/CLL_data.RData' )
pruned_df = df_dict[2].T.dropna().T
journal_df = df_dict[3].loc[['IGHV'],:]
mask = [ v>=0 for v in journal_df.loc['IGHV'].values ]
journal_df = journal_df.iloc[ :,mask ]
use = list(set(pruned_df.columns)&set(journal_df.columns))
analyte_df = pruned_df .loc[ :,use ].apply( lambda x:np.log2(1+x) )
journal_df = journal_df .loc[ :,use ]
print ( analyte_df,journal_df )
from impetuous.quantification import *
qdf = quantify_groups ( analyte_df , journal_df , 'anova~C(IGHV)' , '~/Work/data/naming_and_annotations/reactome/reactome.gmt' )
print(qdf)
exit(1)
base = '../../../data/'
convert_file = base + 'naming_and_annotations/conv.txt'
ens2sym , sym2ens = create_synonyms( convert_file )
s2e = { **flatten_dict(sym2ens) }
name,col,sep = 'Networks.nfo',0,'\t'
with open(name,'r') as input:
for line in input:
gname = line.split(sep)[col].replace('\n','')
if gname in s2e:
print(gname)
else:
print('MISSING:',gname)
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import numpy as np
import pandas as pd
def demo3d ( R = None, colors = None , marker='$+$' ,
figure=1 , show=True, ax=None ) :
__desc__ = """
Scatterplot in 3D
"""
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
fig = plt.figure(figure)
if ax is None:
ax = fig.add_subplot(111, projection='3d')
if R is None :
print('demo3d :: NOTHING TO SHOW')
return
else :
ax.scatter(R[:,0],R[:,1], R[:,2], c=colors, marker=marker )
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
if show :
plt.show()
else :
return ax
def read_xyz(name='data/naj.xyz',header=2,sep=' '):
mol_str = pd.read_csv(name,header=header)
P=[]
for i_ in range(len(mol_str.index)):
line = mol_str.iloc[i_,:].values[0]
lsp = [l.replace(' ','') for l in line.split(sep) if len(l)>0]
P.append(lsp)
pdf = pd.DataFrame(P); pdf.index=pdf.iloc[:,0].values ; pdf=pdf.iloc[:,1:4]
return(pdf.apply(pd.to_numeric))
def KabschAlignment( P,Q ):
#
# https://en.wikipedia.org/wiki/Kabsch_algorithm
# C++ VERSION: https://github.com/richardtjornhammar/RichTools/blob/master/src/richfit.cc
# IN THE VICINITY OF LINE 524
#
N,DIM = np.shape( P )
M,DIM = np.shape( Q )
if DIM>N or not N==M :
print( 'MALFORMED COORDINATE PROBLEM' )
exit( 1 )
q0 , p0 = np.mean(Q,0) , np.mean(P,0)
cQ , cP = Q - q0 , P - p0
H = np.dot(cP.T,cQ)
I = np.eye( DIM )
U, S, VT = np.linalg.svd( H, full_matrices=False )
Ut = np.dot( VT.T,U.T )
I[DIM-1,DIM-1] = 2*(np.linalg.det(Ut) > 0)-1
ROT = np.dot( VT.T,np.dot(I,U.T) )
B = np.dot(ROT,P.T).T + q0 - np.dot(ROT,p0)
return ( B )
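#
# USAGE SKETCH ( HYPOTHETICAL POINT CLOUDS , NOT PART OF THE ORIGINAL CODE ) :
# P AND Q MUST HOLD THE SAME NUMBER OF POINTS IN THE SAME ORDER
# P = np.random.rand( 50 , 3 )
# theta = 0.25*np.pi
# R3 = np.array( [ [np.cos(theta),-np.sin(theta),0] , [np.sin(theta),np.cos(theta),0] , [0,0,1] ] )
# Q = np.dot( P , R3.T ) + 1.0 # A ROTATED AND SHIFTED COPY OF P
# B = KabschAlignment( P , Q ) # B IS P SUPERPOSED ONTO Q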
def WeightsAndScoresOf( P , bFA=False ) :
p0 = np.mean( P,0 )
U, S, VT = np.linalg.svd( P-p0 , full_matrices=False )
weights = U
if bFA :
scores = np.dot(S,VT).T
return ( weights , scores )
scores = VT.T
return ( weights , scores )
def ShapeAlignment( P, Q ,
bReturnTransform = False ,
bShiftModel = True ,
bUnrestricted = False ) :
#
# [*] C++ VERSION: https://github.com/richardtjornhammar/RichTools/blob/master/src/richfit.cc
# SEARCH FOR 'SHAPE FIT' TO FIND SIMILAR CODE IN THE RICHFIT REPO
#
description = """
A NAIVE SHAPE FIT PROCEDURE; MORE SOPHISTICATED
VERSIONS WRITTEN IN C++ CAN BE FOUND IN MY C++ REPO [*]
HERE WE WORK UNDER THE ASSUMPTION THAT Q IS THE MODEL
SO THAT WE SHOULD HAVE SIZE Q < SIZE P WITH UNKNOWN
ORDERING AND THAT THEY SHARE A COMMON SECOND DIMENSION
IN THIS ROUTINE THE COARSE GRAINED DATA ( THE MODEL ) IS
MOVED TO FIT THE FINE GRAINED DATA ( THE DATA )
"""
N,DIM = np.shape( P )
M,DIM = np.shape( Q )
W = (N<M)*N+(N>=M)*M
if (DIM>W or N<M) and not bUnrestricted :
print ( 'MALFORMED PROBLEM' )
print ( description )
exit ( 1 )
q0 , p0 = np.mean(Q,0) , np.mean(P,0)
cQ , cP = Q - q0 , P - p0
sQ = np.dot( cQ.T,cQ )
sP = np.dot( cP.T,cP )
H = np.dot(sP.T,sQ)
I = np.eye( DIM )
U, S, VT = np.linalg.svd( H, full_matrices=False )
Ut = np.dot( VT.T,U.T )
I[DIM-1,DIM-1] = 2*(np.linalg.det(Ut) > 0)-1
ROT = np.dot( VT.T,np.dot(I,U.T) )
if bReturnTransform :
return ( ROT,q0,p0 )
if bShiftModel :# SHIFT THE COARSE GRAINED DATA
B = np.dot(ROT,Q.T).T +p0 - np.dot(ROT,q0)
else : # SHIFT THE FINE GRAINED DATA
B = np.dot(ROT,P.T).T +q0 - np.dot(ROT,p0)
return ( B )
def low_missing_value_imputation ( fdf , fraction = 0.9 , absolute = True ) :
# THIS SVD BASED IMPUTATION METHOD WAS FIRST WRITTEN FOR THE RANKOR PACKAGE
# ORIGINAL CODE IN https://github.com/richardtjornhammar/rankor/blob/master/src/rankor/imputation.py
#
import numpy as np
#
# fdf is a dataframe with NaN values
# fraction is the fraction of information that should be kept
# absolute is used if the data is positive
#
V = fdf.apply(pd.to_numeric).fillna(0).values
u,s,vt = np.linalg.svd(V,full_matrices=False)
s = np.array( [ s[i_] if i_<np.floor(len(s)*fraction) else 0 for i_ in range(len(s)) ] )
nan_values = np.dot(np.dot(u,np.diag(s)),vt)
if absolute :
nan_values = np.abs(nan_values)
#
# THIS CAN BE DONE BETTER
for j in range(len(fdf.columns.values)):
for i in range(len(fdf.index.values)):
if 'nan' in str(fdf.iloc[i,j]).lower():
fdf.iloc[i,j] = nan_values[i,j]
return ( fdf )
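#
# USAGE SKETCH ( HYPOTHETICAL FRAME WITH MISSING VALUES , NOT PART OF THE ORIGINAL CODE ) :
# fdf = pd.DataFrame( [ [1.0,2.0,np.nan] , [2.0,4.1,6.2] , [0.9,np.nan,3.0] ] )
# imputed = low_missing_value_imputation( fdf , fraction = 0.9 , absolute = True )
# THE NaN ENTRIES ARE REPLACED WITH VALUES FROM A RANK REDUCED SVD RECONSTRUCTION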
if __name__ == '__main__' :
#
# IF YOU REQUIRE THE DATA THEN LOOK IN :
# https://github.com/richardtjornhammar/RichTools
# WHERE YOU CAN FIND THE FILES USED HERE
#
if False :
colors = {'H':'#777777','C':'#00FF00','N':'#FF00FF','O':'#FF0000','P':'#FAFAFA'}
Q = read_xyz( name='../data/naj.xyz' , header=2 , sep=' ' )
if False : # TEST KABSCH ALGORITHM
P = Q .copy()
Q = Q * -1
Q = Q + np.random.rand(Q.size).reshape(np.shape(Q.values))
if True :
ax = demo3d ( R=P.values, colors=[colors[c_] for c_ in P.index.values ] ,
figure=1 , show=False, marker='$P$' )
ax = demo3d ( R=Q.values, colors=[colors[c_] for c_ in Q.index.values ] ,
figure=1 , show=False, marker='$Q$', ax=ax )
P_ , Q_ = P.copy() , Q.copy()
P = P_.values
Q = Q_.values
B = KabschAlignment( P,Q )
B = pd.DataFrame( B , index = P_.index.values )
if True :
ax = demo3d ( R=B.values, colors = [ "#000000" for c_ in B.index.values ] ,
figure=1 , show=True, marker='$R$', ax=ax )
if False : # TEST MY SHAPE ALGORITHM
P = read_xyz ( name='data/cloud.xyz' , header=2 , sep='\t' )
P_ , Q_= P.values,Q.values
B = ShapeAlignment( P_,Q_ )
ax = demo3d ( R=Q.values, colors=[colors[c_] for c_ in Q.index.values ] ,
figure=1 , show=False, marker='$Q$' )
ax = demo3d ( R=P.values, colors=[colors[c_] for c_ in P.index.values ] ,
figure=1 , show=False, marker='$P$', ax=ax )
ax = demo3d ( R=B, colors=["#0011BB" for c_ in range(len(B)) ] ,
figure=1 , show=True, marker='$B$', ax=ax )
"""
Copyright 2019 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
from impetuous.quantification import group_significance
from impetuous.convert import *
def pathway_frame_from_file ( filename ,
delimiter = '\t' , item_sep = ',' ) :
pdf = None
with open( filename,'r' ) as input :
for line in input :
lspl = line.replace('\n','').split(delimiter)
analytes_ = lspl[2:]
desc = lspl[1]
iden = lspl[0]
ps = pd.Series( [desc , item_sep.join(analytes_) ] ,
name = iden , index = ['description','analytes'] )
pdf = pd.concat([pdf,pd.DataFrame(ps).T])
return ( pdf )
def create_dag_representation_df ( pathway_file = '../data/GROUPDEFINITIONS.gmt' ,
pcfile = '../data/PCLIST.txt'
) :
pc_list_file = pcfile
tree , ance , desc = parent_child_to_dag ( pc_list_file )
pdf = make_pathway_ancestor_data_frame ( ance )
pdf_ = pathway_frame_from_file( pathway_file )
pdf.index = [v.replace(' ','') for v in pdf.index.values]
pdf_.index= [v.replace(' ','') for v in pdf_.index.values]
dag_df = pd.concat([pdf.T,pdf_.T]).T
return ( dag_df , tree )
def HierarchalEnrichment (
analyte_df , dag_df , dag_level_label = 'DAG,l' ,
ancestors_id_label = 'aid' , id_name = None , threshold = 0.05 ,
p_label = 'C(Status),p', analyte_name_label = 'analytes' ,
item_delimiter = ',' , alexa_elim=False , alternative = 'two-sided'
) :
#
# NEEDS AN ANALYTE SIGNIFICANCE FRAME:
# INCLUDING P VALUES OF ANALYTES
# DAG GRAPH DESCRIPTION FRAME:
# INCLUDING NODE ID, NODE ANALYTES FIELD (SEPARATED BY ITEM DELIMITER)
# INCLUDING ANCESTORS FIELD (SEPARATED BY ITEM DELIMITER)
# DAG LEVEL OF EACH NODE
tolerance = threshold
df = dag_df ; dag_depth = np.max( df[dag_level_label].values )
AllAnalytes = set( analyte_df.index.values ) ; nidx = len( AllAnalytes )
SigAnalytes = set( analyte_df.iloc[ (analyte_df.loc[:,p_label].values < tolerance), : ].index.values )
if len( AllAnalytes ) == len( SigAnalytes ) :
print ( 'THIS STATISTICAL TEST WILL BE NONSENSE' )
print ( 'TRY A DIFFERENT THRESHOLD' )
marked_analytes = {} ; used_analytes = {} ; node_sig = {}
for d in range( dag_depth, 0, -1 ) :
# ROOT IS NOT INCLUDED
filter_ = df [ dag_level_label ] == d
nodes = df.iloc[ [i for i in np.where(filter_)[ 0 ]] ,:].index.values
for node in nodes :
if 'nan' in str(df.loc[node,analyte_name_label]).lower() :
continue
analytes_ = df.loc[node,analyte_name_label].replace('\n','').replace(' ','').split(item_delimiter)
try :
group = analyte_df.loc[[a for a in analytes_ if a in AllAnalytes] ].dropna( axis=0, how='any', thresh=analyte_df.shape[1]/2 ).drop_duplicates()
except KeyError as e :
continue
if node in marked_analytes :
unused_group = group.loc[ list( set(group.index.values)-marked_analytes[node] ) ]
group = unused_group
L_ = len( group ) ; str_analytes=','.join(group.index.values)
if L_ > 0 :
used_analytes[node] = ','.join( group.index.values )
pv,odds = group_significance( group , AllAnalytes=AllAnalytes, SigAnalytes=SigAnalytes , tolerance = threshold , alternative=alternative )
node_sig[node] = pv ; marked_ = set( group.index.values )
ancestors = df.loc[node,ancestors_id_label].replace('\n','').replace(' ','').split(item_delimiter)
if alexa_elim and pv > threshold : # USE ALEXAS ELIM ALGORITHM : IS NOT DEFAULT
continue
for u in ancestors :
if u in marked_analytes :
us = marked_analytes[u]
marked_analytes[u] = us | marked_
else :
marked_analytes[u] = marked_
df['Hierarchal,p'] = [ node_sig[idx] if idx in node_sig else 1. for idx in df.index.values ]
df['Included analytes,ids'] = [ used_analytes[idx] if idx in used_analytes else '' for idx in df.index.values ]
df = df.dropna()
return ( df )
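#
# USAGE SKETCH ( HYPOTHETICAL INPUT FRAMES , NOT PART OF THE ORIGINAL CODE ) :
# analyte_df MUST CARRY A P VALUE COLUMN ( HERE 'C(Status),p' ) FOR EVERY ANALYTE AND
# dag_df IS THE FRAME RETURNED BY create_dag_representation_df TOGETHER WITH THE TREE
# dag_df , tree = create_dag_representation_df( 'GROUPDEFINITIONS.gmt' , 'PCLIST.txt' )
# enrichment_df = HierarchalEnrichment( analyte_df , dag_df ,
#        dag_level_label = 'DAG,level' , ancestors_id_label = 'DAG,ancestors' ,
#        p_label = 'C(Status),p' )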
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from impetuous.clustering import *
from impetuous.quantification import *
import sys
if __name__ == '__main__' :
inventors__ = "Richard Tjörnhammar and Edward Tjörnhammar"
"""
Copyright 2019 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
import re
from impetuous.convert import read_conversions, read_gene_ensemble_conversion, create_synonyms
def find_compartment( istr ) :
return ( re.findall( r'\[(.*?)\]', istr ) )
class Group( object ) :
def get_next(self):
pass
## PATHWAY SECTION
class GenericPathway( Group ) :
def __init__( self , path , gene_name_start = "ENSG0" , gene_mapping = None, # GENE ID MAPPING
is_own_pathway = False, list_like = False , add_pathway_prefix='',
gene_pos=0 , pathway_pos=1 , pathway_desc_pos=3, seperator='\t' ) :
self.file = path
self.prefered_genes = gene_name_start
self.pathways , self.pathway_names = {},{}
self.sep = seperator
self.is_own_pathway = is_own_pathway
self.gene_mapping = gene_mapping
self.gene_pos = gene_pos
self.pathway_pos = pathway_pos
self.pathway_desc_pos = pathway_desc_pos
self.list_like = list_like
self.add_pathway_prefix = add_pathway_prefix
self.replace_pair = None
self.internal_table = None
self.generator = None
self.active_read = False
self.restrict_id_to = None
if 'str' in str(type(self.file)):
self.read_and_parse()
else :
self.parse_df()
def add_pathway_synonyms ( self , synonym_dict , prefix='' ) :
for pathway,pathway_name,genes in self.get_generator() :
synonyms = []
for g in genes :
if g in synonym_dict or prefix+g in synonym_dict :
k = prefix * ( not g in synonym_dict ) + g
sdg = synonym_dict[k]
if 'list' in str(type(sdg)) :
[ synonyms.append(s) for s in sdg ]
if 'str' in str(type(sdg)) :
synonyms.append(sdg)
[ self.pathways[pathway].append(s) for s in synonyms ]
def make_gmt_pathway_file ( self , filename , verbose=False , delimiter='\t', gene_mapping=None ):
#if 'None' in str(type( self.generator )) :
# self.generator = self.get_generator()
if not gene_mapping is None:
self.gene_mapping = gene_mapping
if 'str' in str(type(filename)) :
if 'str' in str( type( filename ) ) :
with open( filename, 'w' ) as o_file :
for pathway, pathway_name, genes in self.get_generator() :
row = list() ; row . append ( pathway )
row . append ( pathway_name )
genes_loc = genes
if 'dict' in str(type(self.gene_mapping)) :
genes_loc = [ self.gene_mapping[gene] if gene in self.gene_mapping.keys() else gene for gene in genes ]
[ row.append ( gene ) for gene in list(set(genes_loc)) ]
row_str = delimiter.join(row)
print(row_str)
o_file . write( row_str+'\n' )
else :
print ( 'MUST SUPPLY A FILENAME' )
def make_internal_table(self, verbose=False, delimiter='\t', output_file=None ) :
self.internal_table = list()
generator = self.get_generator( )
for pathway, pathway_name, genes in generator :
row = list() ; row . append ( pathway )
row .append ( pathway_name )
[ row .append ( gene ) for gene in genes ]
row_str = delimiter.join(row)
self.internal_table.append( row )
def parse_df(self):
if not self.is_own_pathway:
print('ERROR: OPERATION NOT SUPPORTED')
exit(1)
sfcv = set( self.file.columns.values )
if 'gene' in sfcv and 'pathway' in sfcv and 'description' in sfcv:
print ( self.file.columns )
else:
genes = self.file.index.values
for gene in genes :
pathway = self.add_pathway_prefix + gene
self.pathways[pathway] = [gene]
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys() :
self.pathway_names[pathway] = self.gene_mapping[gene]
else :
self.pathway_names[pathway] = gene
def get_generator_from_df(self):
self.parse_df()
for key in self.pathways.keys():
yield(key,self.pathway_names[key],self.pathways[key])
def read_and_parse(self):
with open(self.file) as input:
pathway, pathway_name, genes = "", "", []
for line in input:
lspl = line.split('\t')
if not 'None' in str(type(self.replace_pair)):
pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] ).replace( self.replace_pair[0],self.replace_pair[1] )
else:
pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] )
pathway_name = lspl[self.pathway_desc_pos]
if self.list_like :
# LIST LIKE PATHWAY INVENTORY CANNOT HAVE SELF MAPPING
# HERE WE ASSUME ALL GENES FOLLOW THE FIRST GENE_POS
genes = [ lspl[ir].replace('\n','') for ir in range(self.gene_pos,len(lspl)) ]
if not 'None' in str(type(self.gene_mapping)) :
renamed_genes = [ self.gene_mapping[gene] if gene in self.gene_mapping.keys() else gene for gene in genes ]
genes = renamed_genes
if pathway in self.pathways :
[ self.pathways[pathway].append(gene) for gene in genes ]
else :
self.pathways[pathway] = genes
self.pathway_names[pathway] = pathway_name
else :
if not line.startswith(self.prefered_genes) or len(line)==0:
continue
gene = lspl[ self.gene_pos ]
if self.is_own_pathway :
if gene in self.pathways :
continue;
else:
pway=self.add_pathway_prefix + gene
self.pathways[pway] = [gene]
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys():
self.pathway_names[pway] = self.gene_mapping[gene]
else:
self.pathway_names[pway] = gene
else :
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys():
gene = self.gene_mapping[gene]
if pathway in self.pathways:
self.pathways[pathway].append(gene)
else:
self.pathways[pathway] = [gene]
self.pathway_names[pathway] = pathway_name
def dump_pathways(self):
return ( self.pathways,self.pathway_names )
def get_generator( self, verbose=False ):
if self.active_read :
if not self.file is None :
if 'str' in str(type(self.file)):
self.read_and_parse()
else:
self.parse_df()
if verbose :
print( self.dump_pathways() )
for key in self.pathways.keys():
yield( key , self.pathway_names[key] , self.pathways[key] )
class Reactome( GenericPathway ) :
def __init__( self , path , gene_name_start = 'ENSG0' ,pathway_desc_pos=3,
gene_mapping = None, lexical_restrictions = None, # GENE ID MAPPING
is_own_pathway = False, restrict_id_to = None ) :
self.file = path
self.prefered_genes = gene_name_start
self.pathways , self.pathway_names ,self.pathway_compartments = {},{},{}
self.is_own_pathway = is_own_pathway
self.gene_mapping = gene_mapping
self.gene_pos = 0
self.pathway_pos = 1
self.pathway_desc_pos = pathway_desc_pos
self.list_like = False
self.add_pathway_prefix = ''
self.replace_pair = None
self.lexical_restrictions = lexical_restrictions
self.lexical_restriction_category_pos=None
self.skipstr = None
self.pathway_tag = None
self.restrict_id_to = restrict_id_to
self.active_read = False
if 'str' in str(type(self.file)):
self .read_and_parse(keep_str='ENSG0')
else:
self .parse_df()
def read_and_parse( self , keep_str=None ) :
with open( self.file ) as input :
pathway, pathway_name, genes = "", "", []
for line in input :
if not self.skipstr is None :
if line.startswith(self.skipstr):
continue
if not keep_str is None :
if not line.startswith(keep_str):
continue
lspl = line.split('\t')
if ('[' in line) and (']' in line) :
compartment = find_compartment( line )[0]
else :
compartment = ''
if not 'None' in str(type(self.replace_pair)) :
pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] ).replace( self.replace_pair[0],self.replace_pair[1] )
else :
pathway = ( self.add_pathway_prefix + lspl[self.pathway_pos] )
if not self.restrict_id_to is None :
if not pathway in self.restrict_id_to:
continue
pathway_name = lspl[ self.pathway_desc_pos ]
if not self.lexical_restrictions is None:
if not np.sum( [ int(lr in pathway_name) for lr in self.lexical_restrictions] )>0 : #or len(compartment)<4
continue
if self.lexical_restriction_category_pos is None :
lex_restrict = pathway_name
else :
lex_restrict = lspl[ self.lexical_restriction_category_pos ]
if self.list_like :
# LIST LIKE PATHWAY INVENTORY CANNOT HAVE SELF MAPPING
# HERE WE ASSUME ALL GENES FOLLOW THE FIRST GENE_POS
genes = [ lspl[ir].replace('\n','') for ir in range(self.gene_pos,len(lspl)) ]
if not 'None' in str(type(self.gene_mapping)) :
renamed_genes = [ self.gene_mapping[gene] if gene in self.gene_mapping.keys() else gene for gene in genes ]
genes = renamed_genes
if pathway in self.pathways :
[ self.pathways[pathway].append(gene) for gene in genes ]
else :
self.pathways[pathway] = genes
self.pathway_names[pathway] = pathway_name
self.pathway_compartments[pathway] = compartment
else :
if not line.startswith(self.prefered_genes) or len(line)==0:
continue
gene = lspl[ self.gene_pos ]
if self.is_own_pathway :
if gene in self.pathways :
continue;
else:
pway = self.add_pathway_prefix + gene
self.pathways[pway] = [gene]
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys():
self.pathway_names[pway] = self.gene_mapping[gene]
else:
self.pathway_names[pway] = gene
else :
if not self.pathway_tag is None:
if not self.pathway_tag in pathway:
continue
if not 'None' in str(type(self.gene_mapping)) and gene in self.gene_mapping.keys():
gene = self.gene_mapping[gene]
if pathway in self.pathways:
self.pathways[pathway].append(gene)
else :
if True : # self.lexical_restrictions is None :
self.pathways[pathway] = [gene]
self.pathway_names[pathway] = pathway_name
else :
if len(set(self.lexical_restrictions)-set(lex_restrict.split()))<len(self.lexical_restrictions):
if len( set(self.lexical_restrictions) & set(lex_restrict.split()) )>0:
self.pathways[pathway] = [gene]
self.pathway_names[pathway] = pathway_name
def add_pathway_synonyms( self , synonym_dict, prefix='' ) :
for pathway, pathway_name, genes in self.get_generator() :
synonyms = []
for g in genes:
if g in synonym_dict or prefix+g in synonym_dict :
k = prefix * ( not g in synonym_dict ) + g
sdg = synonym_dict[ k ]
if 'list' in str(type(sdg)) :
[ synonyms.append(s) for s in sdg ]
if 'str' in str(type(sdg)) :
synonyms.append(sdg)
[ self.pathways[pathway].append(s) for s in synonyms ]
def read_extra_information ( self , file_name , valpos = 0 , keypos = 1 ,
add_value_prefix = '' , required_content = 'HSA' ,
add_desc_pos = None , comp_pos = None ) :
with open ( file_name ) as input :
pathway, genes = "", []
for line in input :
lspl = line.split( '\t' )
pw = lspl[ keypos ]
if required_content in pw :
gn = add_value_prefix + lspl[ valpos ]
if not self.restrict_id_to is None :
if not pw in self.restrict_id_to :
continue
if pw in self.pathways :
self.pathways[pw] .append(gn)
else :
self.pathways[pw] = [gn]
self.pathway_names[ pw ] = ''
if not add_desc_pos is None :
if lspl[add_desc_pos] not in self.pathway_names[pw] :
self.pathway_names[pw] += lspl[add_desc_pos] + ', '
if not comp_pos is None :
if self.pathway_compartments is None :
self.pathway_compartments = {}
if pw not in self.pathway_compartments :
self.pathway_compartments[pw] = [ ''.join(find_compartment(lspl[comp_pos])) ]
else :
self.pathway_compartments[pw] .append( ''.join(find_compartment(lspl[comp_pos])) )
def add_extra_information_from_dictionary( self , dictionary, map_onto_genes=True ):
for pathway in self.pathways :
genes = self.pathways[pathway]
add_these = []
[ [ add_these.append(v) for v in dictionary[g] ] for g in genes if g in dictionary.keys() ]
if len(add_these)>0:
[ self.pathways[pathway].append(a) for a in add_these]
def print_generator( generator , show_max_nr=3 , find_str=None ):
for pathway,pathway_name,genes in generator:
if not find_str is None:
if not find_str in pathway:
continue
print('Pathway: ' , pathway )
print('Pathway name: ', pathway_name )
print('Gene amount : ', len(genes), ' \t Gene name of first: ', genes[0] )
print('Gene dump : ', genes )
show_max_nr-=1
if show_max_nr == 0:
break;
def create_listdictionary_from_file( filename , delimiter = '\t' ,add_key_prefix='' ):
wanted_dictionary = dict()
with open( filename ) as input :
value, keyval, descriptor = "", "", []
for line in input:
all_vals = line.replace('\n','').replace(' ','').split( delimiter )
keyv = add_key_prefix+all_vals[0]
valv = all_vals[1:]
wanted_dictionary[keyv] = valv
return ( wanted_dictionary )
def flatten_generator(pathway_generator_object):
for pathway,genes in pathway_generator_object.pathways.items() :
ngenes = []
[ ngenes.append(g) if 'str' in str(type(g)) else [ ngenes.append(q) for q in g ] for g in genes ]
pathway_generator_object.pathways[pathway] = list(set(ngenes))
if pathway in pathway_generator_object.pathway_compartments:
pathway_generator_object.pathway_names[pathway] = '[' + \
','.join(list(set( pathway_generator_object.pathway_compartments[pathway] )) ) + \
'] ' + pathway_generator_object.pathway_names[pathway]
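#
# USAGE SKETCH ( HYPOTHETICAL REACTOME STYLE DUMP , NOT PART OF THE ORIGINAL CODE ) :
# THE INPUT FILE MAPS ENSEMBL GENE IDS TO REACTOME PATHWAY IDS AND NAMES , ONE PAIR PER LINE
# pathways = Reactome( './Ensembl2Reactome_All_Levels.txt' )
# print_generator( pathways.get_generator() , show_max_nr = 3 )
# pathways.make_gmt_pathway_file( './reactome.gmt' )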
if __name__ == '__main__' :
print('ADD BACK REACTOME TEST')
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
#
# MOVED OVER FROM RANKOR (reducer.py) AS OF COMMIT :
# https://github.com/richardtjornhammar/rankor/commit/f04a83091a92130b9b73439b4ad65b5be3056cf9
#
import pandas as pd
import numpy as np
from scipy.stats import rankdata
from impetuous.quantification import qvalues, permuter
#
# REGULAR CONTRAST
contrast = lambda A,B : ( A-B )/( A+B )
#
# OTHER
e_flatness = lambda x : np.exp(np.mean(np.log(x),0))/np.mean(x,0)
e_contrast = lambda x : 1 - e_flatness(x)
pi0 = lambda pvs : 1.
def padded_rolling_average( tv , tau ) :
# AS OF THIS PANDAS VERSION ( 1.1.0 )
# WINDOW CALCULATION WAS NOT PADDED SO
# THIS IS A NUMPY ONLY IMPLEMENTATION
if tau==1 :
return ( tv )
if len(tv)<tau :
return ( [ np.mean(v_) for v_ in tv ] )
centered = lambda x:(np.min(x),np.max(x)) ; N=len(tv);
w = int(np.floor(np.abs(tau)*0.5)) ; idx = [ centered( [int((i-w)>0)*(i-w)%N,i,int(i+w<N)*(i+w)%N]) for i in range(N) ]
mvalues = [ np.mean(tv[i[0]:i[1]]) for i in idx ]
return ( mvalues )
def svd_reduced_mean ( x,axis=0,keep=[0] ) :
if True :
sk = set ( keep )
if len ( np.shape(x) ) > 1 :
u , s , vt = np .linalg .svd( x , full_matrices=False )
xred = np.mean( np.dot(u*[s[i_] if i_ in sk else 0 for i_ in range(len(s))],vt) , axis)
if 'pandas' in str(type(x)) :
if not 'series' in str(type(x)) :
xname = x.index.values[0]
return ( pd.DataFrame( [xred] , index=[xname] , columns=x.columns ) )
else :
xname = x.name
return ( pd.Series( xred , name=xname , index=x.columns ) )
else :
return ( xred )
return ( x )
from sklearn.decomposition import PCA
dimred = PCA ( n_components = 1 )
def pca_reduced_mean( x ) :
if True :
if len ( np.shape(x) ) > 1 :
Xnew = dimred.fit_transform( x.T )
xred = Xnew . T [0] + np.mean(np.mean(x))
if 'pandas' in str(type(x)) :
if not 'series' in str(type(x)) :
xname = x.index.values[0]
return ( pd.DataFrame( [xred] , index=[xname] , columns=x.columns ) )
else :
xname = x.name
return ( pd.Series( xred , name=xname , index=x.columns ) )
return ( x )
def reduction ( a , power , centered=-1 ) :
if centered>0 :
a = ( a.T-np.mean(a,1) ).T
return( np.linalg.svd ( a**power , full_matrices=False ) )
def hyper_params ( df_ , label = 'generic' , sep = ',' , power=1., centered=-1 ):
#
idx_ = df_.index.values
N_s = len ( df_.columns )
u,s,vt = reduction( df_.values , power , centered=centered )
rdf_ = pd.Series ( np.sum(u**2,1) , index=idx_ , name = label+sep+"u" )
rdf_ = pd.concat ( [ pd.DataFrame(rdf_) ,
pd.DataFrame( pd.Series( np.mean( df_.values,1 ) ,
index = idx_ , name=label+sep+"m") ) ] , axis=1 )
w_ = rdf_ .loc[ :,label+sep+"u" ].values
r_ = rankdata ( [ w for w in w_ ] ,'average' )
N = len ( r_ )
#
df0_ = pd.DataFrame( [ [a for a in range(N)],w_,r_ ],index=['idx','w_','r_'], columns=idx_ ).T
#
from scipy.special import erf as erf_
loc_pval = lambda X , mean , stdev : [ 1. - 0.5*( 1. + erf_( ( x - mean )/stdev/np.sqrt(2.) ) ) for x in X ]
lvs = np.log( df0_.loc[ :,'w_'].values )
#
return ( np.mean(lvs) , np.std(lvs) )
def hyper_rdf ( df_ , label = 'generic' , sep = ',' , power=1. ,
diagnostic_output = False , MEAN=None , STD=None , centered=-1 ) :
#
idx_= df_.index.values
N_s = len ( df_.columns )
u,s,vt = reduction ( df_.values , power , centered=centered )
rdf_ = pd.Series ( np.sum( ( u )**2,1 ) , index=idx_ , name = label+sep+"u" )
rdf_ = pd.concat ( [ pd.DataFrame( rdf_ ) ,
pd.DataFrame( pd.Series( np.mean( df_.values,1 ) ,
index = idx_ , name=label+sep+"m") ) ] , axis=1 )
w_ = rdf_ .loc[ :,label+sep+"u" ].values
r_ = rankdata ( [ w for w in w_] ,'average' )
N = len ( r_ )
#
# HERE WE CONSTRUCT A DIAGNOSTIC AND INTERMEDIATE CALCULATION
# DATAFRAME FOR EASIER DEBUGGING AND INSPECTION
df0_ = pd.DataFrame( [ [a for a in range(N)],w_,r_ ],index=['idx','w_','r_'], columns=idx_ )
df0_ = df0_.T.sort_values ( by='r_' )
df0_ .loc[ : , 'd_' ] = [ v for v in ( df0_.loc[:, 'w_' ] * df0_.loc[:,'r_'] ) ]
df0_ .loc[ : , 'da_' ] = np.cumsum ( df0_ .loc[ : , 'd_' ].values )
#
# HOW MANY EQUALLY OR MORE EXTREME VALUES ARE THERE? ( RANK BASED )
df0_ .loc[ : , 'dt_' ] = np.cumsum ( df0_ .loc[ : , 'd_' ].values[::-1] )[::-1]
df0_ .loc[ : , 'rank_ps' ] = df0_ .loc[ :,'dt_' ] / np.sum( df0_ .loc[ :,'d_' ] )
#
# DRAW HISTOGRAM TO SEE DISTRIBUTION OF THE DISTANCES
from scipy.special import erf as erf_
loc_pval = lambda X , mean , stdev : [ 1. - 0.5*( 1. + erf_( ( x - mean )/stdev/np.sqrt(2.) ) ) for x in X ]
lvs = np.log( df0_.loc[ :,'w_'].values )
if MEAN is None or STD is None:
if len(lvs)>3 :
ps = loc_pval( lvs , np.mean( lvs ) , np.std( lvs ) )
else :
ps = df0_ .loc[ : , 'rank_ps' ]
else :
ps = loc_pval( lvs , MEAN , STD )
#
if diagnostic_output :
import scipy.stats as scs
NB = 100
lv = np.log( rdf_.loc[:,[label+sep+"u"]].values )
y , x = np.histogram( lv , bins=NB , density=True )
skw = scs.skew(lv)[0]
kur = scs.kurtosis(lv)[0]
shape_stats = "kurtosis: " + "{:.2f} ".format( kur ) + "skewness: "+ "{:.2f}".format( skw )
locd = lambda x,M,s : (1./s/np.sqrt(2.*np.pi))*np.exp(-0.5*((x-M)/s)**2 )
lin_x = 0.5 * ( x[1:] + x[:-1] )
his_y = y
rsy = sorted( [ (y_,x_) for (y_,x_) in zip(y,lin_x) ] )[::-1]
hm , hs = np.mean([rsy[i][1] for i in range(5)]) , np.mean([rsy[i][0] for i in range(5)])
h_mod_y = locd( lin_x , hm , 1.0/(hs*np.sqrt(2.*np.pi)) )
d_mod_y = locd( lv , np.mean(lv) , np.std(lv) )
rem_y = [ (y_-m_) for (y_,m_) in zip(y, locd(0.5*(x[1:]+x[:-1]),np.mean(lv),np.std(lv))) ]
prc_y = [ 100.*np.abs( contrast(y_,m_) ) for (y_,m_) in zip(y, locd(0.5*(x[1:]+x[:-1]),np.mean(lv),np.std(lv))) ]
RMSD = np.sqrt(np.sum([ ry**2 for ry in rem_y ]))
PMAE = np.mean(prc_y)
#
df0_ .loc[ : , 'pvalues' ] = ps
#
# ASSIGN THE P VALUES
rdf_.loc[df0_.index.values,label+sep+"p"] = df0_.loc[:,'pvalues']
rdf_.loc[df0_.index.values,label+sep+"q"] = [ qvs[0] for qvs in qvalues( df0_.loc[:,'pvalues'].values , pi0 = pi0(df0_.loc[:,'pvalues'].values) ) ]
rdf_.loc[df0_.index.values,label+sep+"r"] = df0_ .loc[ : , 'rank_ps' ]
#
# AND RETURN THE RESULTS
if diagnostic_output :
return ( rdf_ , RMSD , PMAE , kur , skw , rem_y )
else :
return ( rdf_ )
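#
# USAGE SKETCH ( HYPOTHETICAL DATA FRAME , NOT PART OF THE ORIGINAL CODE ) :
# df = pd.DataFrame( np.random.rand( 100 , 8 ) , index = [ 'a'+str(i) for i in range(100) ] )
# rdf = hyper_rdf( df , label = 'demo' )
# rdf NOW CARRIES THE COLUMNS 'demo,u' , 'demo,m' , 'demo,p' , 'demo,q' AND 'demo,r'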
if __name__ == '__main__' :
#
print ( 'REDUCER :: TESTS ' )
#
a = 2*np.random.rand(10)
b = 4*np.random.rand(10)
X = [ [*(a[:5]+1),*a[5:]],[*(b[:5]+3),*(b[5:])] ]
Xdf = pd.DataFrame( X , columns=['a','b','s','c','d','e','f','g','h','i'] , index=['i0','i1'])
#
print ( Xdf )
print ( svd_reduced_mean ( X ) )
print ( svd_reduced_mean ( Xdf ) )
v0 = [1.9,1.8,2.1,1.1,8.,1.2,2.2,3.5,2.0,2.0,3.1,2.1,2.9]
a2 = np.array([[2,2,2,1,1,1,2,3,2,2,3,2,3],v0])
NW = 100
if True:
for i in range(NW):
for w in range(1,i-1):
dat = np.random.rand(i)
pra = padded_rolling_average( dat,tau=w )
str_pra = ''.join([ str(p) for p in pra ])
if 'nan' in str_pra:
print ( pra,len(pra),'[',i,w ,']')
else:
if not len(pra)==len(dat):
print ( len(pra)==len(dat),'[',i,w ,']',len(pra),len(dat) )
print ( padded_rolling_average( np.array( svd_reduced_mean( Xdf ).values ).reshape(-1,1) ,tau=4 ) )
print ( padded_rolling_average( np.array( svd_reduced_mean( Xdf ).values ).reshape(1,-1) ,tau=4 ) )
print ( padded_rolling_average( np.array( svd_reduced_mean( Xdf ).values ) ,tau=4 ) )
print ( padded_rolling_average( v0,tau=4 ) )
print ( v0 )
"""
Copyright 2020 RICHARD TJÖRNHAMMAR
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import pandas as pd
import numpy as np
from scipy.stats import rankdata
beta2M = lambda b : np.log2( b/(1-b) )
M2beta = lambda M : 2**M/(2**M+1)
#
# FRACTIONAL RANKS ON [0,1] ARE RECAST TO +- INFINITY
map_df_to_spectral_order_domain = lambda df: \
df .apply ( lambda x:(rankdata(x,'average')-0.5)/len(x) ) \
.apply ( lambda B:[ beta2M(b) for b in B] )
power = lambda s : np.dot( s.T,np.conj(s) )
#
# THE SPECTRUM IS TRANSFORMED INTO A POWER SPECTRUM
spectre_to_power = lambda sdf: \
pd.DataFrame ( \
power ( sdf.T.apply(np.fft.fft) ) , \
index = sdf.index , columns = sdf.index \
)
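#
# USAGE SKETCH ( HYPOTHETICAL EXPRESSION FRAME , NOT PART OF THE ORIGINAL CODE ) :
# THE FRACTIONAL RANKS ARE RECAST TO M VALUES , FOURIER TRANSFORMED AND COMBINED
# INTO A POWER SPECTRUM INDEXED BY THE ORIGINAL ROWS
# df = pd.DataFrame( np.random.rand( 10 , 64 ) )
# sdf = map_df_to_spectral_order_domain( df )
# pwr = spectre_to_power( sdf )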
#
# THE BELOW METHOD IS HIGHLY EXPERIMENTAL
def transform_to_resonances( spectrum ) :
# assumes spectrum is a square symmetric matrix
f_ls = np.mean( np.abs(spectrum.iloc[0,:].quantile([0.01,0.99])) )
reso = spectrum .apply(lambda ser: np.real(ser.to_numpy())) \
.apply(lambda X:[x/f_ls for x in X]) .apply(M2beta) \
.apply(lambda X:[ 2.*x-1. for x in X] )
return ( reso )
#
# TIME AUTOCORRELATIONS ARE NOT THE SAME THING
# AS PEARSON AUTOCORRELATIONS
def calc_autocorrelation( tv , dt=1 ,bMeanCentered=True ) :
# If you studied statistical mechanics you would
# know about the Wiener-Khinchin theorem
if bMeanCentered :
# So for stocks you might want
# to remove the mean
tv-=np.mean(tv)
ftv = np.fft.fft(tv)
rt = np.fft.ifft(ftv*np.conj(ftv))
rt = rt/rt[0]
rt = rt[:int(np.floor(len(rt)*0.5))]
return( [dt*t for t in range(len(rt))], [np.real(r) for r in rt] )
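#
# USAGE SKETCH ( HYPOTHETICAL SIGNAL , NOT PART OF THE ORIGINAL CODE ) :
# t = np.arange( 256 )
# signal = np.sin( 2.0*np.pi*t/16.0 ) + 0.1*np.random.randn( 256 )
# lag_times , acf = calc_autocorrelation( signal , dt = 1 )
# acf[0] IS NORMALISED TO 1 AND THE PERIODICITY OF THE SIGNAL SHOWS UP AS PEAKS IN acf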
if __name__ == '__main__' :
print ( 'SORRY: NO TESTS HERE' )
print ( 'DEVELOPMENTAL VERSION')
Metadata-Version: 2.1
Name: impetuous-gfa
Version: 0.21.1
Summary: Impetuous Quantification, Alignments, Enrichments and Group Analysis
Home-page: https://github.com/richardtjornhammar/impetuous
Author: Richard Tjörnhammar
Author-email: richard.tjornhammar@gmail.com
License: UNKNOWN
Description: # Simple Group Analysis
Decomposes a set of expressions into a group expression.
The toolkit currently offers enrichment analysis, hierarchical
enrichment analysis, PLS regression, Shape alignment or clustering
as well as rudimentary factor analysis.
The expression regulation can be studied via a statistical
test that relates it to the observables in the journal file.
The final p values are then FDR corrected and the resulting
adjusted p values are produced.
Visit the active code via :
https://github.com/richardtjornhammar/impetuous
Visit the published code :
https://doi.org/10.5281/zenodo.2594690
Cite using :
DOI: 10.5281/zenodo.2594690
Install with :
pip install impetuous-gfa
Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Description-Content-Type: text/markdown
LICENSE
Makefile
README.md
requirements.txt
setup.py
impetuous_gfa.egg-info/PKG-INFO
impetuous_gfa.egg-info/SOURCES.txt
impetuous_gfa.egg-info/dependency_links.txt
impetuous_gfa.egg-info/top_level.txt
src/impetuous/__init__.py
src/impetuous/clustering.py
src/impetuous/convert.py
src/impetuous/fit.py
src/impetuous/hierarchal.py
src/impetuous/impetuous.py
src/impetuous/pathways.py
src/impetuous/quantification.py
src/impetuous/reducer.py
src/impetuous/spectral.py