diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh index eb7b5a73f60397fb30cca74c96be3e17f8ad339d..2c404381249f71702c98d66011245dc77f4cdbea 100755 --- a/downloads-generation/data_curated/GENERATE.sh +++ b/downloads-generation/data_curated/GENERATE.sh @@ -48,24 +48,15 @@ do done time python curate_ms_by_pmid.py $CURATE_BY_PMID_ARGS \ - --ms-out ms.nontraining_curated.by_pmid.csv \ + --ms-out ms.by_pmid.csv \ --expression-out rna_expression.csv \ --expression-metadata-out rna_expression.metadata.csv -bzip2 ms.nontraining_curated.by_pmid.csv +bzip2 ms.by_pmid.csv bzip2 rna_expression.csv - rm -rf ms -# No mass-spec data -time python curate.py \ - --data-iedb \ - "$(mhcflurry-downloads path data_iedb)/mhc_ligand_full.csv.bz2" \ - --data-kim2014 \ - "$(mhcflurry-downloads path data_published)/bdata.20130222.mhci.public.1.txt" \ - --out-csv curated_training_data.no_mass_spec.csv - # With mass-spec data time python curate.py \ --data-iedb \ @@ -74,11 +65,15 @@ time python curate.py \ "$(mhcflurry-downloads path data_published)/bdata.20130222.mhci.public.1.txt" \ --data-systemhc-atlas \ "$(mhcflurry-downloads path data_systemhcatlas)/data.csv.bz2" \ - --include-iedb-mass-spec \ - --out-csv curated_training_data.with_mass_spec.csv + --data-additional-ms "$(pwd)/ms.by_pmid.csv.bz2" \ + --out-csv curated_training_data.csv \ + --out-affinity-csv curated_training_data.affinity.csv \ + --out-mass-spec-csv curated_training_data.mass_spec.csv -bzip2 curated_training_data.no_mass_spec.csv -bzip2 curated_training_data.with_mass_spec.csv +for i in $(ls *.csv) +do + bzip2 $i +done cp $SCRIPT_ABSOLUTE_PATH . 
bzip2 LOG.txt diff --git a/downloads-generation/data_curated/curate.py b/downloads-generation/data_curated/curate.py index 5f99ccd580d119d0cf17a732feb05cbeaa71163d..e14680b678fc6934ea0ac747ea9a278f9a7024b9 100755 --- a/downloads-generation/data_curated/curate.py +++ b/downloads-generation/data_curated/curate.py @@ -3,6 +3,7 @@ Filter and combine various peptide/MHC datasets to derive a composite training s optionally including eluted peptides identified by mass-spec. """ import sys +import os import argparse import pandas @@ -29,20 +30,28 @@ parser.add_argument( action="append", default=[], help="Path to IEDB-style affinity data (e.g. mhc_ligand_full.csv)") +parser.add_argument( + "--data-additional-ms", + action="append", + default=[], + help="Path to additional monoallelic mass spec hits") parser.add_argument( "--data-systemhc-atlas", action="append", default=[], help="Path to systemhc-atlas-style mass-spec data") -parser.add_argument( - "--include-iedb-mass-spec", - action="store_true", - default=False, - help="Include mass-spec observations in IEDB") parser.add_argument( "--out-csv", required=True, + help="Combined result file") +parser.add_argument( + "--out-affinity-csv", + required=False, + help="Result file") +parser.add_argument( + "--out-mass-spec-csv", + required=False, help="Result file") QUALITATIVE_TO_AFFINITY_AND_INEQUALITY = { @@ -70,6 +79,7 @@ def load_data_kim2014(filename): df = pandas.read_table(filename) print("Loaded kim2014 data: %s" % str(df.shape)) df["measurement_source"] = "kim2014" + df["measurement_kind"] = "affinity" df["measurement_value"] = df.meas df["measurement_type"] = (df.inequality == "=").map({ True: "quantitative", @@ -91,6 +101,7 @@ def load_data_systemhc_atlas(filename, min_probability=0.99): df = pandas.read_csv(filename) print("Loaded systemhc atlas data: %s" % str(df.shape)) + df["measurement_kind"] = "mass_spec" df["measurement_source"] = "systemhc-atlas" df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"] 
def load_data_additional_ms(filename):
    """
    Load additional mass-spec hits and keep only the monoallelic samples.

    Parameters
    ----------
    filename : str
        Path to a CSV file (anything ``pandas.read_csv`` accepts) that has
        at least the columns "format", "hla" and "original_pmid".

    Returns
    -------
    pandas.DataFrame
        The monoallelic rows of the input, with these columns added:
        "allele" (normalized from "hla"), "measurement_value",
        "measurement_inequality", "measurement_type", "measurement_kind"
        and "measurement_source".

    Raises
    ------
    ValueError
        If any monoallelic row has an "hla" value that normalizes to
        "UNKNOWN".
    """
    df = pandas.read_csv(filename)
    print("Loaded additional MS", filename, df.shape)
    print(df)
    print("Entries:", len(df))

    print("Subselecting to monoallelic")
    # Compare case-insensitively: the per-PMID handlers that produce this
    # file do not all agree on the case of the "format" value (e.g.
    # "monoallelic" vs "MONOALLELIC"), and an exact match would silently
    # discard every row written in the other case. astype(str) keeps the
    # comparison safe when the column contains missing values.
    is_monoallelic = (
        df["format"].astype(str).str.upper() == "MONOALLELIC")
    df = df.loc[is_monoallelic].copy()
    print("Now", len(df))

    df["allele"] = df["hla"].map(normalize_allele_name)

    # Explicit exception rather than `assert`, so the check still runs
    # under `python -O` and reports which names failed to normalize.
    unknown_hla = df.loc[df["allele"] == "UNKNOWN", "hla"].unique()
    if len(unknown_hla) > 0:
        raise ValueError(
            "Could not normalize allele name(s) in %s: %s" % (
                filename, ", ".join(str(name) for name in unknown_hla)))

    # Mass-spec hits are recorded as qualitative positives: an upper bound
    # ("<") on the affinity assigned to a "Positive" result.
    df["measurement_value"] = QUALITATIVE_TO_AFFINITY["Positive"]
    df["measurement_inequality"] = "<"
    df["measurement_type"] = "qualitative"
    df["measurement_kind"] = "mass_spec"
    df["measurement_source"] = "MS:pmid:" + df["original_pmid"].map(str)
    return df
def _keskin_clean_allele(raw):
    """Clean one HLA cell of the multiallelic metadata sheet and normalize it."""
    # Some cells carry a leading "-"; drop it before normalizing.
    if raw.startswith("-"):
        raw = raw[1:]
    # A bare "B51" is rewritten to "B5101" (presumably a low-resolution
    # typing in the source sheet -- TODO confirm against the publication).
    if raw == "B51":
        raw = "B5101"
    return normalize_allele_name(raw)


def _keskin_sample_matches(clinical_id, directory):
    """Return True if `directory` refers to the sample named `clinical_id`."""
    # Directory names spell the clinical ID in several ways: verbatim, with
    # "-" turned into "_" and the "MEL_" prefix removed, or with spaces
    # turned into "_".
    return (
        clinical_id in directory or
        clinical_id.replace("-", "_").replace("MEL_", "") in directory or
        clinical_id.replace(" ", "_") in directory)


def handle_pmid_31844290(*filenames):
    """Sarkizova, ..., Keskin Nature Biotechnology 2019 [PMID 31844290]

    Parameters
    ----------
    *filenames : str
        Exactly two Excel workbooks. After sorting by name, the first is
        read as the monoallelic data (one sheet per allele) and the second
        as the multiallelic data (a 'Tissue Sample Characteristics'
        metadata sheet plus peptide sheets with "directory" and "sequence"
        columns).

    Returns
    -------
    pandas.DataFrame
        One row per peptide with columns "peptide" (upper-cased),
        "sample_id", "hla", "pulldown_antibody", "format", "mhc_class",
        "sample_type" and "cell_line".

    Raises
    ------
    ValueError
        If an allele cannot be normalized, a cancer type has no sample_type
        mapping, or a directory does not match exactly one clinical ID.
    """
    # Relies on name order: the monoallelic workbook must sort first.
    (mono_filename, multi_filename) = sorted(filenames)

    # Monoallelic
    mono = pandas.read_excel(mono_filename, sheet_name=None)
    dfs = []
    for (key, value) in mono.items():
        if key == 'Sheet1':
            continue
        # Every other sheet is named after an allele, without "HLA-".
        allele = normalize_allele_name("HLA-" + key)
        if allele == "UNKNOWN":
            raise ValueError(
                "Could not normalize allele from sheet name: %s" % key)
        df = pandas.DataFrame({"peptide": value.sequence.values})
        df["sample_id"] = "keskin_%s" % key
        df["hla"] = allele
        df["pulldown_antibody"] = "W6/32"
        # NOTE(review): lower-case value; consumers that select on "format"
        # should compare case-insensitively -- verify against callers.
        df["format"] = "monoallelic"
        df["mhc_class"] = "I"
        df["sample_type"] = "B-CELL"
        df["cell_line"] = "b721"
        dfs.append(df)

    # Multiallelic
    multi = pandas.read_excel(multi_filename, sheet_name=None)
    metadata = multi['Tissue Sample Characteristics']
    by_clinical_id = metadata.drop_duplicates(
        "Clinical ID").set_index("Clinical ID")

    hla_columns = [c for c in metadata if c.startswith("HLA-")]
    allele_table = by_clinical_id.loc[:, hla_columns]
    allele_table = allele_table.loc[~allele_table.index.isnull()]
    # Samples without HLA typing are marked 'n.d.' and dropped.
    allele_table = allele_table.loc[allele_table["HLA-A"] != 'n.d.']
    allele_table = allele_table.applymap(_keskin_clean_allele)

    sample_info = by_clinical_id[['Cancer type', 'IP Ab']]
    # Forward-fill before subsetting so blank cells inherit the value of the
    # row above them in the sheet.
    sample_info = sample_info.loc[~sample_info.index.isnull()].ffill()
    sample_info = sample_info.loc[allele_table.index].copy()
    sample_info["hla"] = [" ".join(row) for _, row in allele_table.iterrows()]
    sample_info["sample_type"] = sample_info['Cancer type'].map({
        'CLL': "B-CELL",
        'GBM': "GLIOBLASTOMA_TISSUE",
        'Melanoma': "MELANOMA",
        "Ovarian": "OVARY",
        'ccRCC': "KIDNEY",
    })

    unmapped = sample_info.loc[
        sample_info["sample_type"].isnull(), 'Cancer type'].unique()
    if len(unmapped) > 0:
        raise ValueError(
            "No sample_type mapping for cancer type(s): %s" % ", ".join(
                str(item) for item in unmapped))

    # Check every row: "hla" holds space-joined allele names, so look for
    # the substring in each string rather than reducing the column first.
    bad_typing = sample_info.loc[
        sample_info["hla"].str.contains("UNKNOWN")]
    if len(bad_typing) > 0:
        raise ValueError(
            "Could not normalize alleles for sample(s): %s" % ", ".join(
                str(item) for item in bad_typing.index))

    for (key, value) in multi.items():
        if key == 'Tissue Sample Characteristics':
            continue
        for (directory, sub_df) in value.groupby("directory"):
            if 'Pat7' in directory or 'Pat9' in directory:
                print("Skipping due to no HLA typing", directory)
                continue
            matches = [
                clinical_id for clinical_id in sample_info.index
                if _keskin_sample_matches(clinical_id, directory)
            ]
            if len(matches) != 1:
                # Fail loudly instead of continuing with an undefined or
                # stale sample_id.
                raise ValueError(
                    "Expected exactly one clinical ID matching directory "
                    "%r, found %d: %s" % (directory, len(matches), matches))
            (sample_id,) = matches
            info = sample_info.loc[sample_id]
            df = pandas.DataFrame({"peptide": sub_df.sequence.values})
            df["sample_id"] = "keskin_%s" % sample_id.replace(" ", "_")
            df["hla"] = info['hla']
            df["pulldown_antibody"] = info['IP Ab']
            df["format"] = "multiallelic"
            df["mhc_class"] = "I"
            df["sample_type"] = info['sample_type']
            df["cell_line"] = None
            dfs.append(df)

    result_df = pandas.concat(dfs, ignore_index=True)
    result_df["peptide"] = result_df.peptide.str.upper()
    return result_df
"sample_type:B-CELL": matches("b-cell"), "sample_type:B721-LIKE": matches("b-cell"), "sample_type:MELANOMA_CELL_LINE": matches("melanoma"), + "sample_type:MELANOMA": matches("melanoma"), "sample_type:A375-LIKE": matches("melanoma"), "sample_type:KG1-LIKE": matches("myeloid leukemia"), @@ -888,6 +979,8 @@ def handle_expression_human_protein_atlas(*filenames): groups={ "sample_type:LUNG": ["lung"], "sample_type:SPLEEN": ["spleen"], + "sample_type:OVARY": ["ovary"], + "sample_type:KIDNEY": ["kidney"], }), ] diff --git a/downloads-generation/data_published/GENERATE.sh b/downloads-generation/data_published/GENERATE.sh index f10b85b6563d2000e06bd87139a62947d1e15f46..11bdacd2fd8e6b4d5fea0ccffdc6fe8667a4c4c6 100755 --- a/downloads-generation/data_published/GENERATE.sh +++ b/downloads-generation/data_published/GENERATE.sh @@ -36,7 +36,7 @@ wget -q https://github.com/openvax/mhcflurry/releases/download/pre-1.1/bdata.201 mkdir ms ############################################ -# MS: Multiallelic class I +# MS: Class I ############################################ # Bassani-Sternberg, ..., Gfeller PLOS Comp. Bio. 2017 [PMID 28832583] # The first dataset is from this work. 
The second dataset is originally from: @@ -106,14 +106,24 @@ PMID=27869121 mkdir -p ms/$PMID wget -q "https://static-content.springer.com/esm/art%3A10.1038%2Fncomms13404/MediaObjects/41467_2016_BFncomms13404_MOESM1318_ESM.xlsx" -P ms/$PMID +# Sarkizova, ..., Keskin Nature Biotechnology 2019 [PMID 31844290] +PMID=31844290 +mkdir -p ms/$PMID +# Monoallelic: +wget -q "https://static-content.springer.com/esm/art%3A10.1038%2Fs41587-019-0322-9/MediaObjects/41587_2019_322_MOESM3_ESM.xlsx" -P ms/$PMID +# Multiallelic: +wget -q "https://static-content.springer.com/esm/art%3A10.1038%2Fs41587-019-0322-9/MediaObjects/41587_2019_322_MOESM4_ESM.xlsx" -P ms/$PMID + + ############################################ -# MS: Monoallelic class II +# MS: Class II ############################################ # Abelin, ..., Rooney Immunity 2019 [PMID 31495665] PMID=31495665 mkdir -p ms/$PMID wget -q https://ars.els-cdn.com/content/image/1-s2.0-S1074761319303632-mmc2.xlsx -P ms/$PMID + ############################################ # RNA-seq expression data (TPMs) ############################################ diff --git a/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh index d8b70a64532a95ee399aced3412e0ed0d1e1e944..4706e28316bb6f279d7cc2dd0d23e36f929f776e 100755 --- a/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh +++ b/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh @@ -47,7 +47,7 @@ then python generate_hyperparameters.py > hyperparameters.yaml fi -for kind in with_mass_spec no_mass_spec +for kind in combined do EXTRA_TRAIN_ARGS="" if [ "$1" == "continue-incomplete" ] && [ -d "models.${kind}" ] @@ -57,7 +57,7 @@ do fi mhcflurry-class1-train-pan-allele-models \ - --data "$(mhcflurry-downloads path data_curated)/curated_training_data.${kind}.csv.bz2" \ + --data "$(mhcflurry-downloads path 
data_curated)/curated_training_data.csv.bz2" \ --allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \ --pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \ --held-out-measurements-per-allele-fraction-and-max 0.25 100 \ diff --git a/downloads-generation/models_class1_pan_unselected/GENERATE.sh b/downloads-generation/models_class1_pan_unselected/GENERATE.sh index c5799bb9c7fa43b23d7011a5f0fddb972cce4cbb..b6f2efae4e5cb259ff36ff6db02ced02e5c7e5a9 100755 --- a/downloads-generation/models_class1_pan_unselected/GENERATE.sh +++ b/downloads-generation/models_class1_pan_unselected/GENERATE.sh @@ -59,7 +59,7 @@ then python generate_hyperparameters.py > hyperparameters.yaml fi -for kind in with_mass_spec no_mass_spec +for kind in combined do EXTRA_TRAIN_ARGS="" if [ "$1" == "continue-incomplete" ] && [ -d "models.${kind}" ] @@ -69,7 +69,7 @@ do fi mhcflurry-class1-train-pan-allele-models \ - --data "$(mhcflurry-downloads path data_curated)/curated_training_data.${kind}.csv.bz2" \ + --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ --allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \ --pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \ --held-out-measurements-per-allele-fraction-and-max 0.25 100 \ diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index 1200b72e97e011942148638ad0641cd7ad435939..59a163587f4a310f4382aeeecd47429a49beb057 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -8,7 +8,7 @@ # by name, the downloads with "default=true" are downloaded. # This should usually be the latest release. -current-release: 1.4.0 +current-release: 1.5.0 # An integer indicating what models the current MHCflurry code base is compatible # with. 
Increment this integer when changes are made to MHCflurry that would break @@ -17,6 +17,89 @@ current-compatibility-version: 2 # Add new releases here as they are made. releases: + 1.5.0: + compatibility-version: 2 + downloads: + - name: models_class1_pan + url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan.20190928.tar.bz2 + default: false + + - name: models_class1_pan_unselected + part_urls: + - https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan_unselected.20190924.tar.bz2.part.aa + default: false + + - name: models_class1_pan_refined + url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/models_class1_pan_refined.20191212c.tar.bz2 + default: false + + - name: models_class1_pan_variants + part_urls: + - https://github.com/openvax/mhcflurry/releases/download/1.4.0/models_class1_pan_variants.20191101.tar.bz2.part.aa + - https://github.com/openvax/mhcflurry/releases/download/1.4.0/models_class1_pan_variants.20191101.tar.bz2.part.ab + default: false + + - name: data_mass_spec_benchmark + url: https://www.dropbox.com/s/4wzotlnl58i1w32/data_mass_spec_benchmark.20191027.tar.bz2?dl=1 + default: false + + - name: data_mass_spec_annotated + url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_mass_spec_annotated.20191030.tar.bz2 + default: false + + - name: data_references + url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2 + default: false + + - name: data_iedb + url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_iedb.20191220.tar.bz2 + default: false + + - name: data_systemhcatlas + url: http://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_systemhcatlas.20190506.tar.bz2 + default: false + + - name: allele_sequences + url: http://github.com/openvax/mhcflurry/releases/download/pan-dev1/allele_sequences.20190506.tar.bz2 + default: false + + - name: random_peptide_predictions + url: 
http://github.com/openvax/mhcflurry/releases/download/pan-dev1/random_peptide_predictions.20190506.tar.bz2 + default: false + + - name: data_published + url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_published.20191220.tar.bz2 + default: false + + - name: data_curated + url: https://github.com/openvax/mhcflurry/releases/download/1.4.0/data_curated.20191030.tar.bz2 + default: true + + # Older downloads + - name: models_class1 + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.2/models_class1.20180225.tar.bz2 + default: true + + - name: models_class1_selected_no_mass_spec + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.2/models_class1_selected_no_mass_spec.20180225.tar.bz2 + default: false + + - name: models_class1_unselected + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.2/models_class1_unselected.20180221.tar.bz2 + default: false + + - name: models_class1_trained_with_mass_spec + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.2.1/models_class1_trained_with_mass_spec.20180228.tar.bz2 + default: false + + - name: models_class1_unselected_with_mass_spec + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.2.1/models_class1_unselected_with_mass_spec.20180227.tar.bz2 + default: false + + - name: models_class1_minimal + url: http://github.com/openvax/mhcflurry/releases/download/pre-1.2/models_class1_minimal.20180226.tar.bz2 + default: false + 1.4.0: compatibility-version: 2 downloads: