diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py index 9ad57b59e3b83e5f6f6305fa544615a301e84053..c1a908084f56c3d68d61ac181d2bc2d0ac642b35 100644 --- a/mhcflurry/__init__.py +++ b/mhcflurry/__init__.py @@ -1,4 +1,27 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import paths +from . import data_helpers +from . import feedforward +from . import common from .mhc1_binding_predictor import Mhc1BindingPredictor -import paths -import data_helpers -import feedforward + +__all__ = [ + "paths", + "data_helpers", + "feedforward", + "common", + "Mhc1BindingPredictor" +] \ No newline at end of file diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py index ba24c6a02320f8a63633bbcc7c01a2aa2a8e6bf4..b606a7a335fdb94d76c514af4a05b62fd065bb8f 100644 --- a/mhcflurry/amino_acid.py +++ b/mhcflurry/amino_acid.py @@ -1,3 +1,17 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + amino_acids = { "A": "Alanine", "R": "Arginine", diff --git a/mhcflurry/class1_allele_specific_hyperparameters.py b/mhcflurry/class1_allele_specific_hyperparameters.py index 4610a10824f3db7298b6b6bbd8e3990b37d71945..68368f8c402daebddd29c5bbc13bfc5c632a4cc0 100644 --- a/mhcflurry/class1_allele_specific_hyperparameters.py +++ b/mhcflurry/class1_allele_specific_hyperparameters.py @@ -1,3 +1,17 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + N_PRETRAIN_EPOCHS = 10 N_EPOCHS = 100 ACTIVATION = "relu" diff --git a/mhcflurry/common.py b/mhcflurry/common.py new file mode 100644 index 0000000000000000000000000000000000000000..17e4c4d9a4558500b7b490ae4ed0e394f0e6b6f5 --- /dev/null +++ b/mhcflurry/common.py @@ -0,0 +1,19 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.


def parse_int_list(s):
    """
    Parse a comma separated string of integers into a list of ints.

    Whitespace around each entry is ignored, so "8, 9,10" gives [8, 9, 10].

    Parameters
    ----------
    s : str
        Comma separated integers, e.g. "8,9,10,11".

    Returns
    -------
    list of int
        One integer per comma separated entry, in input order.

    Raises
    ------
    ValueError
        If any entry (including an empty one) is not a valid integer.
    """
    # int() has to be applied to each entry individually. Wrapping the whole
    # generator expression in a single int() call raises TypeError for every
    # input, which made this function unusable.
    return [int(part.strip()) for part in s.split(",")]


def split_peptide_sequences(s):
    """
    Split a comma separated string of sequences into a list of strings.

    Each entry is stripped of surrounding whitespace and upper-cased.

    Parameters
    ----------
    s : str
        Comma separated sequences, e.g. "siinfekl, AAAWYLWEV".

    Returns
    -------
    list of str
        One normalized entry per comma separated field. Empty fields are
        kept as empty strings, so "" gives [""].
    """
    return [part.strip().upper() for part in s.split(",")]
+# See the License for the specific language governing permissions and +# limitations under the License. + import keras from keras.models import Sequential from keras.layers.core import Dense, Activation, Flatten, Dropout diff --git a/mhcflurry/mhc1_binding_predictor.py b/mhcflurry/mhc1_binding_predictor.py index 5594724dff52a93dca0024be585cddd395d97809..0d3814190a63c0c90263567f074443427064d872 100644 --- a/mhcflurry/mhc1_binding_predictor.py +++ b/mhcflurry/mhc1_binding_predictor.py @@ -1,3 +1,17 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Allele specific MHC Class I binding affinity predictor """ diff --git a/mhcflurry/paths.py b/mhcflurry/paths.py index 323ad63dcd0895394a3af97e269063e3401dfadf..2a587612491296be594d028f98c4fdb01801a2fd 100644 --- a/mhcflurry/paths.py +++ b/mhcflurry/paths.py @@ -1,3 +1,17 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + from os.path import join from appdirs import user_data_dir diff --git a/scripts/build-iedb-class1-dataset.py b/scripts/build-iedb-class1-dataset.py deleted file mode 100755 index 70a7fa738bf7cfc69a9b5efea27bbdd4d9fe51ac..0000000000000000000000000000000000000000 --- a/scripts/build-iedb-class1-dataset.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Turn a raw CSV snapshot of the IEDB contents into a usable -class I binding prediction dataset by grouping all unique pMHCs -""" -from collections import defaultdict -from os.path import join -import pickle - -import numpy as np -import pandas as pd - -from mhcflurry.paths import CLASS1_DATA_DIRECTORY - -IEDB_SOURCE_FILENAME = "mhc_ligand_full.csv" -IEDB_SOURCE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_SOURCE_FILENAME) -print(IEDB_SOURCE_PATH) -OUTPUT_FILENAME = "iedb_human_class1_assay_datasets.pickle" -OUTPUT_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_FILENAME) - -if __name__ == "__main__": - df = pd.read_csv( - IEDB_SOURCE_PATH, - error_bad_lines=False, - encoding="latin-1", - header=[0, 1]) - alleles = df["MHC"]["Allele Name"] - n = len(alleles) - print("# total: %d" % n) - - mask = np.zeros(n, dtype=bool) - patterns = [ - "HLA-A", - "HLA-B", - "HLA-C", - # "H-2-D", - # "H-2-K", - # "H-2-L", - ] - for pattern in patterns: - pattern_mask = alleles.str.startswith(pattern) - print("# %s: %d" % (pattern, pattern_mask.sum())) - mask |= pattern_mask - df = df[mask] - print("# entries matching allele masks: %d" % (len(df))) - assay_group = df["Assay"]["Assay Group"] - assay_method = df["Assay"]["Method/Technique"] - groups = df.groupby([assay_group, assay_method]) - print("---") - print("Assays") - assay_dataframes = {} - # create a dataframe for every distinct kind of assay which is used - # by IEDB submitters to measure peptide-MHC affinity or stability - for (assay_group, assay_method), group_data in sorted( - groups, 
key=lambda x: len(x[1]), reverse=True): - print("%s (%s): %d" % (assay_group, assay_method, len(group_data))) - group_alleles = group_data["MHC"]["Allele Name"] - group_peptides = group_data["Epitope"]["Description"] - distinct_pmhc = group_data.groupby([group_alleles, group_peptides]) - columns = defaultdict(list) - for (allele, peptide), pmhc_group in distinct_pmhc: - columns["mhc"].append(allele) - columns["peptide"].append(peptide) - # performing median in log space since in two datapoint case - # we don't want to take e.g. (10 + 1000) / 2.0 = 505 - # but would prefer something like 10 ** ( (1 + 3) / 2.0) = 100 - columns["value"].append( - np.exp( - np.median( - np.log( - pmhc_group["Assay"]["Quantitative measurement"])))) - qualitative = pmhc_group["Assay"]["Qualitative Measure"] - columns["percent_positive"].append( - qualitative.str.startswith("Positive").mean()) - columns["count"].append( - pmhc_group["Assay"]["Quantitative measurement"].count()) - assay_dataframes[(assay_group, assay_method)] = pd.DataFrame( - columns, - columns=[ - "mhc", - "peptide", - "value", - "percent_positive", - "count"]) - print("# distinct pMHC entries: %d" % len(columns["mhc"])) - with open(OUTPUT_PATH, "w") as f: - pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL) diff --git a/scripts/download-iedb.sh b/scripts/download-iedb.sh index bc962debbfb7974ea0cecb0aeecfbb1cf06fa659..5bdd4ae9b5f387df0ef086141819bdf2c3af86cf 100755 --- a/scripts/download-iedb.sh +++ b/scripts/download-iedb.sh @@ -1,4 +1,19 @@ #!/usr/bin/env bash + +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + rm -f mhc_ligand_full* wget http://www.iedb.org/doc/mhc_ligand_full.zip unzip mhc_ligand_full.zip diff --git a/scripts/download-peters-2013-dataset.sh b/scripts/download-peters-2013-dataset.sh index 76d814aa85a8c8a9d7bcd506cc057b77c63a4a8d..780e8cf407ec248be7cf8642d8c08a40f2aee797 100755 --- a/scripts/download-peters-2013-dataset.sh +++ b/scripts/download-peters-2013-dataset.sh @@ -1,5 +1,19 @@ #!/usr/bin/env bash +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Download dataset from Kim/Peters 2013 "Dataset size and composition" paper rm -f bdata.20130222.mhci.public* wget https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt diff --git a/scripts/mhcflurry.py b/scripts/mhcflurry.py new file mode 100755 index 0000000000000000000000000000000000000000..b361640417902fd58612e679550f09bb6cd77c3c --- /dev/null +++ b/scripts/mhcflurry.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python + +# Copyright (c) 2015. 
# Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

from mhcflurry.common import parse_int_list, split_peptide_sequences

# Command line options as (flag, add_argument keyword options) pairs.
# They are registered in this order, which is also the order argparse
# lists them in the generated --help output.
_ARGUMENT_SPECS = (
    ("--mhc", {
        "default": "HLA-A*02:01",
        "help": "Comma separated list of MHC alleles",
    }),
    # NOTE(review): --sequence is mandatory even though --fasta-file is also
    # accepted; confirm whether the two were meant to be alternatives.
    ("--sequence", {
        "required": True,
        "type": split_peptide_sequences,
        "help": "Comma separated list of protein sequences",
    }),
    ("--fasta-file", {
        "help": "FASTA file of protein sequences to chop up into peptides",
    }),
    ("--peptide-lengths", {
        "default": [9],
        "type": parse_int_list,
        "help": "Comma separated list of peptide length, e.g. 8,9,10,11",
    }),
)

parser = argparse.ArgumentParser()
for _flag, _options in _ARGUMENT_SPECS:
    parser.add_argument(_flag, **_options)

if __name__ == "__main__":
    # Only the command line is parsed here; nothing in this script consumes
    # the resulting namespace yet.
    args = parser.parse_args()