Skip to content
Snippets Groups Projects
Commit ddf6a312 authored by Alex Rubinsteyn
Browse files

added skeleton of commandline frontend

parent b60d22a8
No related branches found
No related tags found
No related merge requests found
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import paths
from . import data_helpers
from . import feedforward
from . import common
from .mhc1_binding_predictor import Mhc1BindingPredictor
import paths
import data_helpers
import feedforward
# Names exported by `from <package> import *`: the submodules and the
# Mhc1BindingPredictor class imported at the top of this module.
__all__ = [
"paths",
"data_helpers",
"feedforward",
"common",
"Mhc1BindingPredictor"
]
\ No newline at end of file
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
amino_acids = {
"A": "Alanine",
"R": "Arginine",
......
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
N_PRETRAIN_EPOCHS = 10
N_EPOCHS = 100
ACTIVATION = "relu"
......
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def parse_int_list(s):
    """
    Parse a comma separated string of integers into a list of ints.

    Whitespace around each entry is ignored and empty entries (e.g. the
    one produced by a trailing comma) are skipped.

    Raises ValueError if any non-empty entry is not a valid integer.
    """
    # int() must be applied to each entry; applying it to the whole
    # generator raises TypeError.
    parts = (part.strip() for part in s.split(","))
    return [int(part) for part in parts if part]
def split_peptide_sequences(s):
    """
    Split a comma separated string into a list of upper-case sequences.

    Whitespace around each entry is removed. Empty entries, such as those
    produced by a trailing comma or an empty input string, are dropped so
    that callers never receive an empty sequence.
    """
    stripped = (part.strip() for part in s.split(","))
    return [part.upper() for part in stripped if part]
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import namedtuple
import pandas as pd
import numpy as np
......
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
......
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Allele specific MHC Class I binding affinity predictor
"""
......
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from os.path import join
from appdirs import user_data_dir
......
"""
Turn a raw CSV snapshot of the IEDB contents into a usable
class I binding prediction dataset by grouping all unique pMHCs
"""
from collections import defaultdict
from os.path import join
import pickle
import numpy as np
import pandas as pd
from mhcflurry.paths import CLASS1_DATA_DIRECTORY
# Raw IEDB CSV export, looked up inside the class I data directory
IEDB_SOURCE_FILENAME = "mhc_ligand_full.csv"
IEDB_SOURCE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_SOURCE_FILENAME)
# NOTE: this runs whenever the module is loaded, not only when it is
# executed as a script
print(IEDB_SOURCE_PATH)
# Pickle file written by the __main__ section, in the same directory
OUTPUT_FILENAME = "iedb_human_class1_assay_datasets.pickle"
OUTPUT_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_FILENAME)
if __name__ == "__main__":
    # Load the raw IEDB export. The file is read with a two-row header, so
    # columns are addressed as df[<first row>][<second row>]. Malformed
    # lines are skipped instead of aborting the whole load.
    df = pd.read_csv(
        IEDB_SOURCE_PATH,
        error_bad_lines=False,
        encoding="latin-1",
        header=[0, 1])
    alleles = df["MHC"]["Allele Name"]
    n = len(alleles)
    print("# total: %d" % n)

    # Keep only the rows whose allele name starts with one of the prefixes
    # listed below.
    mask = np.zeros(n, dtype=bool)
    patterns = [
        "HLA-A",
        "HLA-B",
        "HLA-C",
        # "H-2-D",
        # "H-2-K",
        # "H-2-L",
    ]
    for pattern in patterns:
        pattern_mask = alleles.str.startswith(pattern)
        print("# %s: %d" % (pattern, pattern_mask.sum()))
        mask |= pattern_mask
    df = df[mask]
    print("# entries matching allele masks: %d" % (len(df)))

    assay_group = df["Assay"]["Assay Group"]
    assay_method = df["Assay"]["Method/Technique"]
    groups = df.groupby([assay_group, assay_method])
    print("---")
    print("Assays")
    assay_dataframes = {}
    # create a dataframe for every distinct kind of assay which is used
    # by IEDB submitters to measure peptide-MHC affinity or stability
    # (largest groups are processed first)
    for (assay_group, assay_method), group_data in sorted(
            groups, key=lambda x: len(x[1]), reverse=True):
        print("%s (%s): %d" % (assay_group, assay_method, len(group_data)))
        group_alleles = group_data["MHC"]["Allele Name"]
        group_peptides = group_data["Epitope"]["Description"]
        distinct_pmhc = group_data.groupby([group_alleles, group_peptides])
        # one output row per distinct (allele, peptide) pair
        columns = defaultdict(list)
        for (allele, peptide), pmhc_group in distinct_pmhc:
            columns["mhc"].append(allele)
            columns["peptide"].append(peptide)
            # performing median in log space since in two datapoint case
            # we don't want to take e.g. (10 + 1000) / 2.0 = 505
            # but would prefer something like 10 ** ( (1 + 3) / 2.0) = 100
            columns["value"].append(
                np.exp(
                    np.median(
                        np.log(
                            pmhc_group["Assay"]["Quantitative measurement"]))))
            # fraction of measurements whose qualitative label starts
            # with "Positive"
            qualitative = pmhc_group["Assay"]["Qualitative Measure"]
            columns["percent_positive"].append(
                qualitative.str.startswith("Positive").mean())
            # number of non-null quantitative measurements
            columns["count"].append(
                pmhc_group["Assay"]["Quantitative measurement"].count())
        assay_dataframes[(assay_group, assay_method)] = pd.DataFrame(
            columns,
            columns=[
                "mhc",
                "peptide",
                "value",
                "percent_positive",
                "count"])
        print("# distinct pMHC entries: %d" % len(columns["mhc"]))

    # pickle emits bytes, so the output file must be opened in binary mode;
    # text mode fails on Python 3 and can corrupt the data on Windows.
    with open(OUTPUT_PATH, "wb") as f:
        pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL)
#!/usr/bin/env bash
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
rm -f mhc_ligand_full*
wget http://www.iedb.org/doc/mhc_ligand_full.zip
unzip mhc_ligand_full.zip
......
#!/usr/bin/env bash
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Download dataset from Kim/Peters 2013 "Dataset size and composition" paper
rm -f bdata.20130222.mhci.public*
wget https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt
......
#!/usr/bin/env python
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from mhcflurry.common import parse_int_list, split_peptide_sequences
# Command-line options of the prediction front-end.
parser = argparse.ArgumentParser()

# Kept as the raw string given on the command line (no type conversion).
parser.add_argument(
    "--mhc",
    default="HLA-A*02:01",
    help="Comma separated list of MHC alleles")

# Mandatory; the raw string is converted by split_peptide_sequences.
# NOTE(review): because this option is required, --fasta-file cannot be
# used on its own -- confirm whether that is intended.
parser.add_argument(
    "--sequence",
    required=True,
    type=split_peptide_sequences,
    help="Comma separated list of protein sequences")

parser.add_argument(
    "--fasta-file",
    help="FASTA file of protein sequences to chop up into peptides")

# The default is already a list; values given on the command line are
# converted by parse_int_list.
parser.add_argument(
    "--peptide-lengths",
    default=[9],
    type=parse_int_list,
    help="Comma separated list of peptide length, e.g. 8,9,10,11")

if __name__ == "__main__":
    # Arguments are only parsed when the file is executed as a script.
    args = parser.parse_args()
# Copyright (c) 2014. Mount Sinai School of Medicine
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -55,7 +55,7 @@ if __name__ == '__main__':
install_requires=[
'numpy>=1.7',
'pandas>=0.13.1',
'appdirs',
'appdirs',
],
long_description=readme,
packages=['mhcflurry'],
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment