From c0f827b56ddbc0f259d9d617754cac913e0e9998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Tj=C3=B6rnhammar?= Date: Wed, 23 Jun 2021 11:05:49 +0200 Subject: [PATCH] encoding method --- README.md | 2 +- setup.py | 2 +- src/impetuous/quantification.py | 40 +++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8a37957..f7101d4 100755 --- a/README.md +++ b/README.md @@ -298,7 +298,7 @@ which will report that |Hierarchical,p | 6.55459e-05 | |Name: | R-HSA-390522 | -is affected or perhaps needs to be compensated for... now perhaps you thought this excerice was a tad tedious? Well you are correct. It is and you could just as well have copied the gene transcripts into [String-db](https://string-db.org/cgi/input?sessionId=beIptQQxF85j&input_page_active_form=multiple_identifiers) and gotten similar results out. But, then you wouldn't have gotten to use the hierarchical enrichment method I invented! +is affected or perhaps needs to be compensated for... now perhaps you thought this exercise was a tad tedious? Well you are correct. It is and you could just as well have copied the gene transcripts into [String-db](https://string-db.org/cgi/input?sessionId=beIptQQxF85j&input_page_active_form=multiple_identifiers) and gotten similar results out. But, then you wouldn't have gotten to use the hierarchical enrichment method I invented! These examples were meant as illustrations of some of the codes implemented in the impetuous-gfa package. 
def create_encoding_data_frame ( journal_df , formula , bVerbose=False ) :
    """Build the encoding data frame described by a model formula.

    The right hand side of ``formula`` (the text after the first '~') is
    scanned for categorical terms, categorical interaction pairs and plain
    linear terms. Categorical labels are passed to ``create_encoding_journal``
    and the linear terms are appended as their own rows before the final
    transpose.

    Parameters
    ----------
    journal_df : pandas.DataFrame
        The coarse grained data (the model). Rows are looked up by label,
        either as the bare name or wrapped as ``C(name)``.
        NOTE: for every interaction pair found, a new row labelled
        ``'<a>:<b>'`` is written into ``journal_df`` IN PLACE, holding the
        two member rows' values joined by '-' (the values must therefore
        support string concatenation).
    formula : str
        The semantic description of the problem, e.g. ``'y ~ C(a) + C(b) + x'``.
        Must contain '~'; otherwise an IndexError is raised.
    bVerbose : bool, optional
        When True, print the encoding columns (if any) and a progress note.

    Returns
    -------
    pandas.DataFrame or None
        The transposed encoding frame, or None when the formula yields
        neither categorical nor linear terms.

    Raises
    ------
    KeyError
        From the ``.loc`` lookups when a term of the formula cannot be found
        in ``journal_df.index``.
    """
    #
    # THE JOURNAL_DF IS THE COARSE GRAINED DATA (THE MODEL)
    # THE FORMULA IS THE SEMANTIC DESCRIPTION OF THE PROBLEM
    #
    rhs = formula.split('~')[1]
    # SNAPSHOT OF THE LABELS BEFORE ANY INTERACTION ROWS ARE ADDED
    known_labels = set( journal_df.index )

    def resolve_label ( name ) :
        # PREFER THE BARE NAME AND FALL BACK TO THE C(...) WRAPPED ONE.
        # AN UNKNOWN NAME RESOLVES TO '' AS BEFORE. USING ONE RULE FOR BOTH
        # PAIRS AND SINGLE CATEGORIES AVOIDS THE CONCATENATED 'nameC(name)'
        # LABEL THAT AROSE WHEN BOTH SPELLINGS WERE PRESENT IN THE INDEX
        if name in known_labels :
            return name
        wrapped = 'C(' + name + ')'
        if wrapped in known_labels :
            return wrapped
        return ''

    add_pairs = []
    for pair in find_category_interactions ( rhs ) :
        upair = [ resolve_label( p ) for p in pair ]
        pair_label = ':'.join( upair )
        # ONE COMBINED 'a-b' VALUE PER JOURNAL COLUMN ( WRITTEN IN PLACE )
        journal_df.loc[ pair_label ] = [ v[0]+'-'+v[1] for v in journal_df.loc[ upair,: ].T.values ]
        add_pairs.append( pair_label )

    use_categories = [ resolve_label( u ) for u in list(set(find_category_variables( rhs ))) ]
    use_categories = [ *use_categories , *add_pairs ]
    #
    if len( use_categories ) > 0 :
        encoding_df = create_encoding_journal ( use_categories , journal_df ).T
    else :
        encoding_df = None
    #
    if bVerbose :
        # THERE ARE NO COLUMNS TO REPORT WHEN THE FORMULA HAS NO CATEGORIES
        if encoding_df is not None :
            print ( [ v for v in encoding_df.columns.values ] )
        print ( 'ADD IN ANY LINEAR TERMS AS THEIR OWN AXIS' )
    #
    # THIS TURNS THE MODEL INTO A MIXED LINEAR MODEL
    linear_terms = [ c.replace(' ','') for c in rhs.split('+') if 'C(' not in c ]
    add_df = journal_df.loc[ linear_terms , : ]
    if len( add_df ) > 0 :
        if encoding_df is None :
            encoding_df = add_df.T
        else :
            encoding_df = pd.concat( [ encoding_df.T , add_df ] ).T
    return ( encoding_df )