diff --git a/README.md b/README.md index f8c05f3f19d1878fff2ac0ed69733e562b95a19b..2d69f7b6e3ad4a76b3055931ce826db96f68f983 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,10 @@ -Workflow for generating phenotype score combinations and correlating them to biofilm. + There is one rule: no Excel. Every time I use excel, I have to rename the file and they get lost and I can't retrace my steps. Forcing no excel, I can see every step and fix them where I need to. First things first: - -1. Generate normalized scores from the sorted scores. - * A sorted score is a the average of the raw scores from the biological replicates. An individual photo is a biological replicate. - - * `score_wrangler.R` takes in the un-normalized scores and generates a normalized column using the `preProcess()` function from the `caret` package. - - * This program will also remove data that we do not want (we removed certain non-albicans *Candida* species that didn't grow under certain conditions. - - * After this, the files are modified with `column_clean.py` (called inside the R script) to remove the leading column and to clean up the column content if necessary. - - * Finally, the program makes a file with all the score data in it. Repeatability. No Excel. - - * I also had it combine all the scores. That just made things a lot easier. - -2. Correlate all the normalized sum scores with biofilm. +1. Correlate all the normalized sum scores with biofilm. * I need a table for these that include the information on what scores are included in the composite scores, the media, and the temperature, as well as the correlation metrics. * `additive_correlator.R` Using the `cor.test()` function described by [STHDA](http://www.sthda.com/english/wiki/correlation-test-between-two-variables-in-r) diff --git a/column_clean.py b/column_clean.py deleted file mode 100644 index 2a5f47b366ee384d658ab67a44d65c2ea4148f26..0000000000000000000000000000000000000000 --- a/column_clean.py +++ /dev/null @@ -1,45 +0,0 @@ -import sys - -file_name = sys.argv[1] - -file = open( file_name, "r" ) - -biofilm = 0 -if "biofilm" in file_name: - biofilm = 1 - -# Gross header : MAY.Strain.. Species Soll.Clade Isolation.Site MTL.Genotype Media Temperature..C. MJD.Phenotype.Score MJD.Score.St..Dev. RJF.Phenotype.Score RJF.Score.St..Dev. Total.Average.Phenotype.Score Total.Phenotype.Score.St..Dev. Normalized.Scores -new_header = "May Strain, Species, Soll Clade, Isolation Site, MTL Genotype, Media, Temperature ("+ u"\N{DEGREE SIGN}" + "C), MJD Phenotype Score, MJD Score St. Dev., RJF Phenotype Score, RJF Score St. Dev., Total Average Phenotype Score, Total Phenotype Score St. Dev., Normalized Scores" -biofilm_header = "May Strain, Species, Soll Clade, Isolation Site, Media, Temperature ("+ u"\N{DEGREE SIGN}" + "C), Total Average Phenotype Score, Total Phenotype Score St. Dev., Normalized Scores" - -new_file = open( "nc_" + file_name[5:],"w") #nc = normalized clean - -header = 1 -for lines in file: - - if header: - - if biofilm: - print( biofilm_header, file = new_file ) - else: - print( new_header, file = new_file ) - - file_header = lines.split(",") - header = 0 - continue - - line_list = lines.split(",") - - if biofilm: - new_line_list = line_list[0:] - else: - new_line_list = line_list[0:] - # new_line_list = line_list[1:7] # Include these two lines if you want to remove the per-person scoring - # new_line_list.extend( line_list[11:] ) - - line_str = ",".join(new_line_list).strip() - - print( line_str, file = new_file ) - -file.close() -new_file.close() diff --git a/score_wrangler.py b/score_wrangler.py deleted file mode 100644 index 49e9103b8e9c6fd459b4c688734453351dd55882..0000000000000000000000000000000000000000 --- a/score_wrangler.py +++ /dev/null @@ -1,10 +0,0 @@ -adh_file = open( "adhesion_scores_sorted.csv", "r" ) -fil_file = open( "filamentation_scores_sorted.csv", "r") -inv_file = open( "invasion_scores_sorted.csv","r" ) - - - - -adh_file.close() -fil_file.close() -inv_file.close() \ No newline at end of file