diff --git a/downloads-generation/data_curated/curate_by_pmid.py b/downloads-generation/data_curated/curate_by_pmid.py index 7f2617770f09e4df21c6c4936bf532d0f914fa8b..bf9ce98187f61bfb0c7eefecbed42b00b93bfa7a 100755 --- a/downloads-generation/data_curated/curate_by_pmid.py +++ b/downloads-generation/data_curated/curate_by_pmid.py @@ -5,6 +5,7 @@ optionally including eluted peptides identified by mass-spec. import sys import argparse import os +import collections import pandas @@ -122,28 +123,58 @@ def handle_pmid_25576301(filename): assert peptides[0] == "AAAAAAAQSVY" assert peptides[-1] == "YYYNGKAVY" - # TODO TODO - import ipdb ; ipdb.set_trace() + column_to_sample = {} + for s in [c for c in df if c.startswith("Intensity ")]: + assert s[-2] == "-" + column_to_sample[s] = s.replace("Intensity ", "")[:-2].strip() - # THIS IS ALL JUNK: - result = pandas.DataFrame({ - "peptide": peptides, - }) - result["sample_id"] = "24616531" - result["sample_type"] = "B-lymphoblastoid" - result["cell_line"] = "GR" + intensity_columns = list(column_to_sample) + + rows = [] + for _, row in df.iterrows(): + x1 = row[intensity_columns] + x2 = x1[x1 > 0].index.map(column_to_sample).value_counts() + x3 = x2[x2 >= 2] # require at least two replicates for each peptide + for sample in x3.index: + rows.append((row.Sequence, sample)) + + result = pandas.DataFrame(rows, columns=["peptide", "sample_id"]) + result["cell_line"] = "" result["pulldown_antibody"] = "W6/32" - # Note: this publication lists hla as "HLA-A*01,-03, B*07,-27, and -C*02,-07" - # we are guessing the exact 4 digit alleles based on this. - result["hla"] = "HLA-A*01:01 HLA-A*03:01 HLA-B*07:02 HLA-B*27:05 HLA-C*02:02 HLA-C*07:01" + allele_map = { + 'Fib': "HLA-A*03:01 HLA-A*23:01 HLA-B*08:01 HLA-B*15:18 HLA-C*07:02 HLA-C*07:04", + 'HCC1937': "HLA-A*23:01 HLA-A*24:02 HLA-B*07:02 HLA-B*40:01 HLA-C*03:04 HLA-C*07:02", + 'SupB15WT': None, # four digit alleles unknown, will drop sample + 'SupB15RT': None, + 'HCT116': "HLA-A*01:01 HLA-A*02:01 HLA-B*45:01 HLA-B*18:01 HLA-C*05:01 HLA-C*07:01", + + # Homozygous at HLA-A: + 'HCC1143': "HLA-A*31:01 HLA-A*31:01 HLA-B*35:08 HLA-B*37:01 HLA-C*04:01 HLA-C*06:02", + + # Homozygous everywhere: + 'JY': "HLA-A*02:01 HLA-A*02:01 HLA-B*07:02 HLA-B*07:02 HLA-C*07:02 HLA-C*07:02", + } + + sample_type = { + 'Fib': "fibroblast", + 'HCC1937': "basal like breast cancer", + 'SupB15WT': None, + 'SupB15RT': None, + 'HCT116': "colon carcinoma", + 'HCC1143': "basal like breast cancer", + 'JY': "B-cell", + } + result["hla"] = result.sample_id.map(allele_map) + print("Entries before dropping samples with unknown alleles", len(result)) + result = result.loc[~result.hla.isnull()] + print("Entries after dropping samples with unknown alleles", len(result)) + result["sample_type"] = result.sample_id.map(sample_type) + print(result.head(3)) return result - - - # Hack to add all functions with names like handle_pmid_XXXX to HANDLERS dict. for (key, value) in list(locals().items()): if key.startswith("handle_pmid_"):