From 15b85d7c23b095f213a69588451ee5784716bb67 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Fri, 27 Sep 2019 14:38:56 -0400 Subject: [PATCH] fix --- downloads-generation/data_curated/GENERATE.sh | 5 ++++- downloads-generation/data_curated/curate_by_pmid.py | 10 ++++++++++ mhcflurry/downloads.yml | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh index 6e88bc7f..fd041f93 100755 --- a/downloads-generation/data_curated/GENERATE.sh +++ b/downloads-generation/data_curated/GENERATE.sh @@ -40,7 +40,10 @@ do CURATE_BY_PMID_ARGS+=$(echo --item $pmid raw/$pmid/* ' ') done -time python curate_by_pmid.py $CURATE_BY_PMID_ARGS --out curated.by_pmid.csv --debug +time python curate_by_pmid.py $CURATE_BY_PMID_ARGS \ + --out nontraining_curated.by_pmid.csv + +bzip2 nontraining_curated.by_pmid.csv rm -rf raw diff --git a/downloads-generation/data_curated/curate_by_pmid.py b/downloads-generation/data_curated/curate_by_pmid.py index 5d62f62b..4f1950e5 100755 --- a/downloads-generation/data_curated/curate_by_pmid.py +++ b/downloads-generation/data_curated/curate_by_pmid.py @@ -126,6 +126,12 @@ def handle_pmid_23481700(filename): results.append(result_df) result_df = pandas.concat(results, ignore_index=True) + + # Rename samples to avoid a collision with the JY sample in PMID 25576301. + result_df.sample_id = result_df.sample_id.map({ + "JY": "JY.2015", + "HHC": "HHC.2015", + }) return result_df @@ -666,6 +672,10 @@ def run(): else: print("No nulls.") + # Each sample should be coming from only one experiment. + assert df.groupby("sample_id").pmid.nunique().max() == 1, ( + df.groupby("sample_id").pmid.nunique().sort_values()) + df.to_csv(args.out, index=False) print("Wrote: %s" % os.path.abspath(args.out)) diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index fe5e6f28..3e05c91e 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -50,7 +50,7 @@ releases: default: false - name: data_curated - url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_curated.20190925.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_curated.20190927.tar.bz2 default: true # Older downloads -- GitLab