Skip to content
Snippets Groups Projects
Commit 138f9170 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

update

parent cf72f9aa
No related branches found
No related tags found
No related merge requests found
...@@ -81,6 +81,11 @@ time python make_allele_sequences.py \ ...@@ -81,6 +81,11 @@ time python make_allele_sequences.py \
--differentiate-alleles training_data.alleles.txt \ --differentiate-alleles training_data.alleles.txt \
--out-csv allele_sequences.csv --out-csv allele_sequences.csv
time python make_allele_sequences.py \
class1.aligned.fasta \
--recapitulate-sequences class1_pseudosequences.csv \
--out-csv allele_sequences.no_differentiation.csv
# Cleanup # Cleanup
gzip -f class1.fasta gzip -f class1.fasta
gzip -f class1.aligned.fasta gzip -f class1.aligned.fasta
...@@ -88,6 +93,6 @@ rm *.fasta ...@@ -88,6 +93,6 @@ rm *.fasta
cp $SCRIPT_ABSOLUTE_PATH . cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" echo "Created archive: $RESULT"
\ No newline at end of file
...@@ -133,6 +133,7 @@ def run(): ...@@ -133,6 +133,7 @@ def run():
assert agreement > 0.9 assert agreement > 0.9
# Add additional positions # Add additional positions
additional_positions = []
if args.differentiate_alleles: if args.differentiate_alleles:
differentiate_alleles = pandas.read_csv( differentiate_alleles = pandas.read_csv(
args.differentiate_alleles).iloc[:,0].values args.differentiate_alleles).iloc[:,0].values
......
...@@ -62,9 +62,11 @@ if [ "$2" != "continue-incomplete" ] ...@@ -62,9 +62,11 @@ if [ "$2" != "continue-incomplete" ]
then then
cp $SCRIPT_DIR/generate_hyperparameters.production.py . cp $SCRIPT_DIR/generate_hyperparameters.production.py .
cp $SCRIPT_DIR/generate_hyperparameters.py . cp $SCRIPT_DIR/generate_hyperparameters.py .
cp $SCRIPT_DIR/normalize_allele_names.py .
python generate_hyperparameters.production.py > hyperparameters.production.yaml python generate_hyperparameters.production.py > hyperparameters.production.yaml
python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml
python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml
python normalize_allele_names.py "$(mhcflurry-downloads path allele_sequences)/class1_pseudosequences.csv" --out allele_sequences.34mer.csv
fi fi
for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence
...@@ -80,7 +82,7 @@ do ...@@ -80,7 +82,7 @@ do
HYPERPARAMETERS=hyperparameters.$kind.yaml HYPERPARAMETERS=hyperparameters.$kind.yaml
if [ "$kind" == "34mer_sequence" ] if [ "$kind" == "34mer_sequence" ]
then then
ALLELE_SEQUENCES="$(mhcflurry-downloads path allele_sequences)/class1_pseudosequences.csv" ALLELE_SEQUENCES=allele_sequences.34mer.csv
HYPERPARAMETERS=hyperparameters.production.yaml HYPERPARAMETERS=hyperparameters.production.yaml
fi fi
......
"""
Normalize MHC allele names
"""
from sys import argv
import os
import pandas
import mhcnames
import argparse
def normalize(s, disallowed=("MIC", "HFE")):
    """
    Normalize an MHC allele name, tolerating over-specified names.

    Parameters
    ----------
    s : str
        Raw allele name.
    disallowed : sequence of str
        Substrings that disqualify a name outright; if any of them occurs
        anywhere in `s`, no normalization is attempted.

    Returns
    -------
    str or None
        The result of `mhcnames.normalize_allele_name` for `s` or, failing
        that, for the longest prefix of `s` (obtained by repeatedly dropping
        the last ":"-separated field) that can be parsed. None if the name
        is disallowed or no prefix can be parsed.
    """
    if any(item in s for item in disallowed):
        return None

    candidate = s
    while True:
        try:
            return mhcnames.normalize_allele_name(candidate)
        except Exception:
            # Parsing failed for this candidate; fall through and retry with
            # a shorter name. `Exception` (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            pass
        if not candidate:
            # Nothing left to trim: every prefix has been tried.
            return None
        # Drop the trailing ":"-separated field (yields "" when no ":" left).
        candidate = candidate.rpartition(":")[0]
# Command-line interface: a positional input CSV and an output CSV path.
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument("input_csv")
# --out is mandatory: without it, to_csv(None) would write nothing and the
# final os.path.abspath(None) would raise only after all the work was done.
parser.add_argument("--out", required=True, help="CSV output")
args = parser.parse_args(argv[1:])

df = pandas.read_csv(args.input_csv)
print("Read df with shape", df.shape)

# Rewrite the "allele" column with normalized names; normalize() returns None
# for disallowed or unparseable names, and those rows are dropped.
df["allele"] = df["allele"].map(normalize)
df = df.loc[~df.allele.isnull()]
print("Done normalizing. After removing unparseable names, shape is", df.shape)

# Several raw names can normalize to the same allele; keep one row per allele.
df = df.drop_duplicates("allele")
print("After dropping duplicates", df.shape)

df.to_csv(args.out, index=False)
print("Wrote", os.path.abspath(args.out))
...@@ -66,7 +66,7 @@ releases: ...@@ -66,7 +66,7 @@ releases:
default: false default: false
- name: data_curated - name: data_curated
url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191011.tar.bz2 url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191030.tar.bz2
default: true default: true
# Older downloads # Older downloads
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment