diff --git a/downloads-generation/data_curated/README.md b/downloads-generation/data_curated/README.md index 7ac75d2026a99dd6c6344a3130cae05c74e12c57..25227beed78ecfb4c0e3425af556497cd26a46f2 100644 --- a/downloads-generation/data_curated/README.md +++ b/downloads-generation/data_curated/README.md @@ -1,9 +1,10 @@ # Combined training data -This download contains the data used to train the production class1 MHCflurry models. This data is derived from a recent [IEDB](http://www.iedb.org/home_v3.php) export as well as the data from [Kim 2014](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241). +This download contains the data used to train the production class1 MHCflurry models. This data is derived from a recent [IEDB](http://www.iedb.org/home_v3.php) export, the data from [Kim 2014](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241), and a number of other sources. To generate this download run: ``` +pip install -r requirements.txt # only needed the first time you generate this download ./GENERATE.sh -``` \ No newline at end of file +``` diff --git a/downloads-generation/data_curated/curate_by_pmid.py b/downloads-generation/data_curated/curate_by_pmid.py index 4d478b9de14402f0cd93a9a4bf92af5e2eeeecdb..5d62f62b0a6cbad1bca0cefb1eadd0d286c15c2e 100755 --- a/downloads-generation/data_curated/curate_by_pmid.py +++ b/downloads-generation/data_curated/curate_by_pmid.py @@ -229,7 +229,7 @@ def handle_pmid_26992070(*filenames): for num in ["1", "2"]: allele_info[ "HLA-%s %s" % (gene, num) - ] = "HLA-" + gene + allele_info["HLA-%s %s" % (gene, num)] + ] = "HLA-" + gene + "*" + allele_info["HLA-%s %s" % (gene, num)] cell_line_to_allele = allele_info.apply(" ".join, axis=1) sheets = {} @@ -585,7 +585,7 @@ def handle_pmid_31495665(filename): "MAPTAC_DRB3*01:01_dm-": "mixed", } - df = pandas.read_excel(filename, sheetname="DataS1B") + df = pandas.read_excel(filename, sheet_name="DataS1B") results = [] for sample_id in 
df.columns: if hla_type[sample_id] is None: