Skip to content
Snippets Groups Projects
Commit 35c09c17 authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

simplified download scripts to work locally, need to eventually replace with Python

parent 7ce7d7f5
No related branches found
No related tags found
No related merge requests found
......@@ -14,3 +14,5 @@
from .class1_binding_predictor import Class1BindingPredictor
from .predict import predict
from .ensemble import Ensemble
from .package_metadata import __version__
__version__ = "0.0.2"
......@@ -15,7 +15,10 @@
from os.path import join
from appdirs import user_data_dir
BASE_DIRECTORY = user_data_dir("mhcflurry", version="0.1")
# increase the version of the base directory every time we make a breaking change
# in how the data is represented or how the models are serialized
BASE_DIRECTORY = user_data_dir("mhcflurry", version="2")
CLASS1_DATA_DIRECTORY = join(BASE_DIRECTORY, "class1_data")
CLASS1_MODEL_DIRECTORY = join(BASE_DIRECTORY, "class1_models")
......
......@@ -2,4 +2,5 @@ numpy>= 1.7
pandas>=0.13.1
appdirs
theano
keras<1.0
\ No newline at end of file
keras<1.0
h5py
\ No newline at end of file
......@@ -26,20 +26,20 @@ from __future__ import (
absolute_import,
unicode_literals
)
from os.path import join
from os import makedirs
from os.path import join, exists
import pickle
from collections import Counter
import argparse
import pandas as pd
from mhcflurry.paths import CLASS1_DATA_DIRECTORY, CLASS1_DATA_CSV_PATH
from mhcflurry.paths import CLASS1_DATA_DIRECTORY, CLASS1_DATA_CSV_FILENAME
IEDB_PICKLE_FILENAME = "iedb_human_class1_assay_datasets.pickle"
IEDB_PICKLE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_PICKLE_FILENAME)
PETERS_CSV_FILENAME = "bdata.20130222.mhci.public.1.txt"
PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME)
KIM_2013_CSV_FILENAME = "bdata.20130222.mhci.public.1.txt"
parser = argparse.ArgumentParser()
......@@ -71,13 +71,18 @@ parser.add_argument(
parser.add_argument(
"--netmhcpan-csv-path",
default=PETERS_CSV_PATH,
default=KIM_2013_CSV_FILENAME,
help="Path to CSV with NetMHCpan dataset from 2013 Peters paper")
parser.add_argument(
"--output-csv-path",
default=CLASS1_DATA_CSV_PATH,
help="Path to CSV of combined assay results")
"--output-dir",
default=CLASS1_DATA_DIRECTORY,
help="Path to directory where output CSV should be written")
parser.add_argument(
"--output-csv-filename",
default=CLASS1_DATA_CSV_FILENAME,
help="Name of combined CSV file")
parser.add_argument(
"--extra-dataset-csv-path",
......@@ -88,6 +93,9 @@ parser.add_argument(
if __name__ == "__main__":
args = parser.parse_args()
if not exists(args.output_dir):
makedirs(args.output_dir)
print("Reading %s..." % args.iedb_pickle_path)
with open(args.iedb_pickle_path, "rb") as f:
iedb_datasets = pickle.load(f)
......
......@@ -19,23 +19,42 @@ Turn a raw CSV snapshot of the IEDB contents into a usable
class I binding prediction dataset by grouping all unique pMHCs
"""
from collections import defaultdict
from os.path import join
from os import makedirs
from os.path import join, exists
import pickle
import argparse
import numpy as np
import pandas as pd
from mhcflurry.paths import CLASS1_DATA_DIRECTORY
IEDB_SOURCE_FILENAME = "mhc_ligand_full.csv"
IEDB_SOURCE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_SOURCE_FILENAME)
PICKLE_OUTPUT_FILENAME = "iedb_human_class1_assay_datasets.pickle"
parser = argparse.ArgumentParser()
parser.add_argument(
"--input-csv",
default=IEDB_SOURCE_FILENAME,
help="CSV file with IEDB's MHC binding data")
parser.add_argument(
"--output-dir",
default=CLASS1_DATA_DIRECTORY,
help="Directory to write output pickle file")
OUTPUT_FILENAME = "iedb_human_class1_assay_datasets.pickle"
OUTPUT_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_FILENAME)
parser.add_argument(
"--output-pickle-filename",
default=PICKLE_OUTPUT_FILENAME,
help="Path to .pickle file containing dictionary of IEDB assay datasets")
if __name__ == "__main__":
args = parser.parse_args()
df = pd.read_csv(
IEDB_SOURCE_PATH,
args.input_csv,
error_bad_lines=False,
encoding="latin-1",
header=[0, 1])
......@@ -48,9 +67,7 @@ if __name__ == "__main__":
"HLA-A",
"HLA-B",
"HLA-C",
# "H-2-D",
# "H-2-K",
# "H-2-L",
"H-2",
]
for pattern in patterns:
pattern_mask = alleles.str.startswith(pattern)
......@@ -98,5 +115,10 @@ if __name__ == "__main__":
"percent_positive",
"count"])
print("# distinct pMHC entries: %d" % len(columns["mhc"]))
with open(OUTPUT_PATH, "wb") as f:
if not exists(args.output_dir):
makedirs(args.output_dir)
output_path = join(args.output_dir, args.output_pickle_filename)
with open(args.output, "wb") as f:
pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL)
#!/usr/bin/env bash
# Copyright (c) 2015. Mount Sinai School of Medicine
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -17,6 +17,3 @@
rm -f mhc_ligand_full*
wget http://www.iedb.org/doc/mhc_ligand_full.zip
unzip mhc_ligand_full.zip
DATA_DIR=`python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"`
mkdir -p -- "$DATA_DIR"
mv mhc_ligand_full.csv "$DATA_DIR"
\ No newline at end of file
#!/usr/bin/env bash
# Copyright (c) 2015. Mount Sinai School of Medicine
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -18,6 +18,3 @@
# 2013 "Dataset size and composition" paper
rm -f bdata.2009.mhci.public.1*
wget https://dl.dropboxusercontent.com/u/3967524/bdata.2009.mhci.public.1.txt
DATA_DIR=`python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"`
mkdir -p -- "$DATA_DIR"
mv bdata.2009.mhci.public.1.txt "$DATA_DIR"
#!/usr/bin/env bash
# Copyright (c) 2015. Mount Sinai School of Medicine
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -17,6 +17,3 @@
# Download dataset from Kim/Peters 2013 "Dataset size and composition" paper
rm -f bdata.20130222.mhci.public*
wget https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt
DATA_DIR=`python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"`
mkdir -p -- "$DATA_DIR"
mv bdata.20130222.mhci.public.1.txt "$DATA_DIR"
#!/usr/bin/env bash
# Copyright (c) 2015. Mount Sinai School of Medicine
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -17,8 +17,4 @@
# Download BLIND 2013 dataset from Kim/Peters 2013
# "Dataset size and composition" paper
rm -f bdata.2013.mhci.public.blind*
wget https://dl.dropboxusercontent.com/u/3967524/bdata.2013.mhci.public.blind.1.txt
DATA_DIR=`python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"`
mkdir -p -- "$DATA_DIR"
mv bdata.2013.mhci.public.blind.1.txt "$DATA_DIR"
#!/usr/bin/env bash
DATA_DIR=`python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"`
echo "$DATA_DIR"
\ No newline at end of file
......@@ -14,6 +14,7 @@
import os
import logging
import re
from setuptools import setup
......@@ -34,10 +35,17 @@ except:
logging.warn("Conversion of long_description from MD to RST failed")
pass
with open('mhcflurry/package_metadata.py', 'r') as f:
version = re.search(
r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
f.read(),
re.MULTILINE).group(1)
if __name__ == '__main__':
setup(
name='mhcflurry',
version="0.0.1",
version=version,
description="MHC Binding Predictor",
author="Alex Rubinsteyn",
author_email="alex {dot} rubinsteyn {at} mssm {dot} edu",
......@@ -58,6 +66,7 @@ if __name__ == '__main__':
'appdirs',
'theano',
'keras',
'h5py',
# using for multi-threaded web server
'cherrypy'
],
......
0% Loading… or loading failed; retry.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment