Skip to content
Snippets Groups Projects
Commit 8a6b85f1 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

update

parent d3600b1f
No related merge requests found
...@@ -81,6 +81,11 @@ time python make_allele_sequences.py \ ...@@ -81,6 +81,11 @@ time python make_allele_sequences.py \
--differentiate-alleles training_data.alleles.txt \ --differentiate-alleles training_data.alleles.txt \
--out-csv allele_sequences.csv --out-csv allele_sequences.csv
time python make_allele_sequences.py \
class1.aligned.fasta \
--recapitulate-sequences class1_pseudosequences.csv \
--out-csv allele_sequences.no_differentiation.csv
# Cleanup # Cleanup
gzip -f class1.fasta gzip -f class1.fasta
gzip -f class1.aligned.fasta gzip -f class1.aligned.fasta
...@@ -88,6 +93,6 @@ rm *.fasta ...@@ -88,6 +93,6 @@ rm *.fasta
cp $SCRIPT_ABSOLUTE_PATH . cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt bzip2 LOG.txt
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" echo "Created archive: $RESULT"
\ No newline at end of file
...@@ -133,6 +133,7 @@ def run(): ...@@ -133,6 +133,7 @@ def run():
assert agreement > 0.9 assert agreement > 0.9
# Add additional positions # Add additional positions
additional_positions = []
if args.differentiate_alleles: if args.differentiate_alleles:
differentiate_alleles = pandas.read_csv( differentiate_alleles = pandas.read_csv(
args.differentiate_alleles).iloc[:,0].values args.differentiate_alleles).iloc[:,0].values
......
...@@ -62,9 +62,11 @@ if [ "$2" != "continue-incomplete" ] ...@@ -62,9 +62,11 @@ if [ "$2" != "continue-incomplete" ]
then then
cp $SCRIPT_DIR/generate_hyperparameters.production.py . cp $SCRIPT_DIR/generate_hyperparameters.production.py .
cp $SCRIPT_DIR/generate_hyperparameters.py . cp $SCRIPT_DIR/generate_hyperparameters.py .
cp $SCRIPT_DIR/normalize_allele_names.py .
python generate_hyperparameters.production.py > hyperparameters.production.yaml python generate_hyperparameters.production.py > hyperparameters.production.yaml
python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml python generate_hyperparameters.py hyperparameters.production.yaml no_pretrain > hyperparameters.no_pretrain.yaml
python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml python generate_hyperparameters.py hyperparameters.no_pretrain.yaml single_hidden > hyperparameters.single_hidden_no_pretrain.yaml
python normalize_allele_names.py "$(mhcflurry-downloads path allele_sequences)/class1_pseudosequences.csv" --out allele_sequences.34mer.csv
fi fi
for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence for kind in single_hidden_no_pretrain no_pretrain 34mer_sequence
...@@ -80,7 +82,7 @@ do ...@@ -80,7 +82,7 @@ do
HYPERPARAMETERS=hyperparameters.$kind.yaml HYPERPARAMETERS=hyperparameters.$kind.yaml
if [ "$kind" == "34mer_sequence" ] if [ "$kind" == "34mer_sequence" ]
then then
ALLELE_SEQUENCES="$(mhcflurry-downloads path allele_sequences)/class1_pseudosequences.csv" ALLELE_SEQUENCES=allele_sequences.34mer.csv
HYPERPARAMETERS=hyperparameters.production.yaml HYPERPARAMETERS=hyperparameters.production.yaml
fi fi
......
"""
Normalize MHC allele names
"""
from sys import argv
import os
import pandas
import mhcnames
import argparse
def normalize(s, disallowed=("MIC", "HFE")):
    """
    Normalize a single MHC allele name using mhcnames.

    The full name is tried first. If mhcnames fails on it, the last
    colon-separated field is dropped and the shorter name is tried, and so
    on until a name is accepted or nothing is left.

    Parameters
    ----------
    s : str
        Raw allele name. Non-string values (e.g. the NaN pandas yields for
        an empty cell) are treated as unparseable.
    disallowed : iterable of str
        Names containing any of these substrings are rejected outright.

    Returns
    -------
    str or None
        The value returned by mhcnames.normalize_allele_name, or None if the
        name is rejected or no truncation of it could be normalized.
    """
    if not isinstance(s, str):
        return None
    if any(item in s for item in disallowed):
        return None

    candidate = s
    while candidate:
        try:
            return mhcnames.normalize_allele_name(candidate)
        except Exception:
            # Catch Exception rather than everything so Ctrl-C / SystemExit
            # still propagate. Drop the last ':'-separated field and retry;
            # rpartition yields "" once no ':' remains, which ends the loop.
            candidate = candidate.rpartition(":")[0]
    return None
# Command-line entry point: read a CSV with an "allele" column, normalize the
# allele names, drop rows whose name could not be normalized, keep the first
# row per normalized allele, and write the result to the --out path.
# description= (not usage=) keeps argparse's auto-generated usage line.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("input_csv", help="CSV input with an 'allele' column")
# --out is required: without it to_csv(None) writes no file and the final
# os.path.abspath(None) would fail only after all the work was done.
parser.add_argument("--out", required=True, help="CSV output")
args = parser.parse_args(argv[1:])

df = pandas.read_csv(args.input_csv)
print("Read df with shape", df.shape)

# normalize() returns None for names it rejects or cannot parse.
df["allele"] = df["allele"].map(normalize)
df = df.loc[~df.allele.isnull()]
print("Done normalizing. After removing unparseable names, shape is", df.shape)

# Distinct raw names can normalize to the same allele; keep the first row.
df = df.drop_duplicates("allele")
print("After dropping duplicates", df.shape)

df.to_csv(args.out, index=False)
print("Wrote", os.path.abspath(args.out))
...@@ -66,7 +66,7 @@ releases: ...@@ -66,7 +66,7 @@ releases:
default: false default: false
- name: data_curated - name: data_curated
url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191011.tar.bz2 url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191030.tar.bz2
default: true default: true
# Older downloads # Older downloads
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment