Skip to content
Snippets Groups Projects
Commit 35c09c17 authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

simplified download scripts to work locally, need to eventually replace with Python

parent 7ce7d7f5
No related branches found
No related tags found
No related merge requests found
......@@ -14,3 +14,5 @@
from .class1_binding_predictor import Class1BindingPredictor
from .predict import predict
from .ensemble import Ensemble
from .package_metadata import __version__
__version__ = "0.0.2"
......@@ -15,7 +15,10 @@
from os.path import join
from appdirs import user_data_dir
BASE_DIRECTORY = user_data_dir("mhcflurry", version="0.1")
# increase the version of the base directory every time we make a breaking change
# in how the data is represented or how the models are serialized
BASE_DIRECTORY = user_data_dir("mhcflurry", version="2")
CLASS1_DATA_DIRECTORY = join(BASE_DIRECTORY, "class1_data")
CLASS1_MODEL_DIRECTORY = join(BASE_DIRECTORY, "class1_models")
......
......@@ -2,4 +2,5 @@ numpy>= 1.7
pandas>=0.13.1
appdirs
theano
keras<1.0
\ No newline at end of file
keras<1.0
h5py
\ No newline at end of file
......@@ -26,20 +26,20 @@ from __future__ import (
absolute_import,
unicode_literals
)
from os.path import join
from os import makedirs
from os.path import join, exists
import pickle
from collections import Counter
import argparse
import pandas as pd
from mhcflurry.paths import CLASS1_DATA_DIRECTORY, CLASS1_DATA_CSV_PATH
from mhcflurry.paths import CLASS1_DATA_DIRECTORY, CLASS1_DATA_CSV_FILENAME
IEDB_PICKLE_FILENAME = "iedb_human_class1_assay_datasets.pickle"
IEDB_PICKLE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_PICKLE_FILENAME)
PETERS_CSV_FILENAME = "bdata.20130222.mhci.public.1.txt"
PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME)
KIM_2013_CSV_FILENAME = "bdata.20130222.mhci.public.1.txt"
parser = argparse.ArgumentParser()
......@@ -71,13 +71,18 @@ parser.add_argument(
parser.add_argument(
"--netmhcpan-csv-path",
default=PETERS_CSV_PATH,
default=KIM_2013_CSV_FILENAME,
help="Path to CSV with NetMHCpan dataset from 2013 Peters paper")
parser.add_argument(
"--output-csv-path",
default=CLASS1_DATA_CSV_PATH,
help="Path to CSV of combined assay results")
"--output-dir",
default=CLASS1_DATA_DIRECTORY,
help="Path to directory where output CSV should be written")
parser.add_argument(
"--output-csv-filename",
default=CLASS1_DATA_CSV_FILENAME,
help="Name of combined CSV file")
parser.add_argument(
"--extra-dataset-csv-path",
......@@ -88,6 +93,9 @@ parser.add_argument(
if __name__ == "__main__":
args = parser.parse_args()
if not exists(args.output_dir):
makedirs(args.output_dir)
print("Reading %s..." % args.iedb_pickle_path)
with open(args.iedb_pickle_path, "rb") as f:
iedb_datasets = pickle.load(f)
......
......@@ -19,23 +19,42 @@ Turn a raw CSV snapshot of the IEDB contents into a usable
class I binding prediction dataset by grouping all unique pMHCs
"""
from collections import defaultdict
from os.path import join
from os import makedirs
from os.path import join, exists
import pickle
import argparse
import numpy as np
import pandas as pd
from mhcflurry.paths import CLASS1_DATA_DIRECTORY
IEDB_SOURCE_FILENAME = "mhc_ligand_full.csv"
IEDB_SOURCE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_SOURCE_FILENAME)
PICKLE_OUTPUT_FILENAME = "iedb_human_class1_assay_datasets.pickle"
parser = argparse.ArgumentParser()
parser.add_argument(
"--input-csv",
default=IEDB_SOURCE_FILENAME,
help="CSV file with IEDB's MHC binding data")
parser.add_argument(
"--output-dir",
default=CLASS1_DATA_DIRECTORY,
help="Directory to write output pickle file")
OUTPUT_FILENAME = "iedb_human_class1_assay_datasets.pickle"
OUTPUT_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_FILENAME)
parser.add_argument(
"--output-pickle-filename",
default=PICKLE_OUTPUT_FILENAME,
help="Path to .pickle file containing dictionary of IEDB assay datasets")
if __name__ == "__main__":
args = parser.parse_args()
df = pd.read_csv(
IEDB_SOURCE_PATH,
args.input_csv,
error_bad_lines=False,
encoding="latin-1",
header=[0, 1])
......@@ -48,9 +67,7 @@ if __name__ == "__main__":
"HLA-A",
"HLA-B",
"HLA-C",
# "H-2-D",
# "H-2-K",
# "H-2-L",
"H-2",
]
for pattern in patterns:
pattern_mask = alleles.str.startswith(pattern)
......@@ -98,5 +115,10 @@ if __name__ == "__main__":
"percent_positive",
"count"])
print("# distinct pMHC entries: %d" % len(columns["mhc"]))
with open(OUTPUT_PATH, "wb") as f:
if not exists(args.output_dir):
makedirs(args.output_dir)
output_path = join(args.output_dir, args.output_pickle_filename)
with open(args.output, "wb") as f:
pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL)
#!/usr/bin/env bash
# Copyright (c) 2015. Mount Sinai School of Medicine
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -17,6 +17,3 @@
rm -f mhc_ligand_full*
wget http://www.iedb.org/doc/mhc_ligand_full.zip
unzip mhc_ligand_full.zip
DATA_DIR=`python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"`
mkdir -p -- "$DATA_DIR"
mv mhc_ligand_full.csv "$DATA_DIR"
\ No newline at end of file
#!/usr/bin/env bash
# Copyright (c) 2015. Mount Sinai School of Medicine
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -18,6 +18,3 @@
# 2013 "Dataset size and composition" paper
rm -f bdata.2009.mhci.public.1*
wget https://dl.dropboxusercontent.com/u/3967524/bdata.2009.mhci.public.1.txt
DATA_DIR=`python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"`
mkdir -p -- "$DATA_DIR"
mv bdata.2009.mhci.public.1.txt "$DATA_DIR"
#!/usr/bin/env bash
# Copyright (c) 2015. Mount Sinai School of Medicine
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -17,6 +17,3 @@
# Download dataset from Kim/Peters 2013 "Dataset size and composition" paper
rm -f bdata.20130222.mhci.public*
wget https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt
DATA_DIR=`python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"`
mkdir -p -- "$DATA_DIR"
mv bdata.20130222.mhci.public.1.txt "$DATA_DIR"
#!/usr/bin/env bash
# Copyright (c) 2015. Mount Sinai School of Medicine
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -17,8 +17,4 @@
# Download BLIND 2013 dataset from Kim/Peters 2013
# "Dataset size and composition" paper
rm -f bdata.2013.mhci.public.blind*
wget https://dl.dropboxusercontent.com/u/3967524/bdata.2013.mhci.public.blind.1.txt
DATA_DIR=`python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"`
mkdir -p -- "$DATA_DIR"
mv bdata.2013.mhci.public.blind.1.txt "$DATA_DIR"
#!/usr/bin/env bash
DATA_DIR=`python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"`
echo "$DATA_DIR"
\ No newline at end of file
......@@ -14,6 +14,7 @@
import os
import logging
import re
from setuptools import setup
......@@ -34,10 +35,17 @@ except:
logging.warn("Conversion of long_description from MD to RST failed")
pass
with open('mhcflurry/package_metadata.py', 'r') as f:
version = re.search(
r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
f.read(),
re.MULTILINE).group(1)
if __name__ == '__main__':
setup(
name='mhcflurry',
version="0.0.1",
version=version,
description="MHC Binding Predictor",
author="Alex Rubinsteyn",
author_email="alex {dot} rubinsteyn {at} mssm {dot} edu",
......@@ -58,6 +66,7 @@ if __name__ == '__main__':
'appdirs',
'theano',
'keras',
'h5py',
# using for multi-threaded web server
'cherrypy'
],
......
0% Loading… or loading failed; retry.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment