diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh index 6e88bc7fcecc1866e5495628db1ebbff1accace4..fd041f93db8468eac835da4110f2f359ad90e7fa 100755 --- a/downloads-generation/data_curated/GENERATE.sh +++ b/downloads-generation/data_curated/GENERATE.sh @@ -40,7 +40,10 @@ do CURATE_BY_PMID_ARGS+=$(echo --item $pmid raw/$pmid/* ' ') done -time python curate_by_pmid.py $CURATE_BY_PMID_ARGS --out curated.by_pmid.csv --debug +time python curate_by_pmid.py $CURATE_BY_PMID_ARGS \ + --out nontraining_curated.by_pmid.csv + +bzip2 nontraining_curated.by_pmid.csv rm -rf raw diff --git a/downloads-generation/data_curated/curate_by_pmid.py b/downloads-generation/data_curated/curate_by_pmid.py index 5d62f62b0a6cbad1bca0cefb1eadd0d286c15c2e..4f1950e5e15e2ef40ce1d7b8cf7199c08a4c6405 100755 --- a/downloads-generation/data_curated/curate_by_pmid.py +++ b/downloads-generation/data_curated/curate_by_pmid.py @@ -126,6 +126,12 @@ def handle_pmid_23481700(filename): results.append(result_df) result_df = pandas.concat(results, ignore_index=True) + + # Rename samples to avoid a collision with the JY sample in PMID 25576301. + result_df.sample_id = result_df.sample_id.map({ + "JY": "JY.2015", + "HHC": "HHC.2015", + }) return result_df @@ -666,6 +672,10 @@ def run(): else: print("No nulls.") + # Each sample should be coming from only one experiment. + assert df.groupby("sample_id").pmid.nunique().max() == 1, ( + df.groupby("sample_id").pmid.nunique().sort_values()) + df.to_csv(args.out, index=False) print("Wrote: %s" % os.path.abspath(args.out)) diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index fe5e6f28924fca4a6cfa2509001ba8ba7b1abca1..3e05c91e5b5ddf92045db2feff828e7dde51f132 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -50,7 +50,7 @@ releases: default: false - name: data_curated - url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_curated.20190925.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_curated.20190927.tar.bz2 default: true # Older downloads