diff --git a/downloads-generation/data_curated/README.md b/downloads-generation/data_curated/README.md
index 7ac75d2026a99dd6c6344a3130cae05c74e12c57..25227beed78ecfb4c0e3425af556497cd26a46f2 100644
--- a/downloads-generation/data_curated/README.md
+++ b/downloads-generation/data_curated/README.md
@@ -1,9 +1,10 @@
 # Combined training data
 
-This download contains the data used to train the production class1 MHCflurry models. This data is derived from a recent [IEDB](http://www.iedb.org/home_v3.php) export as well as the data from [Kim 2014](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241). 
+This download contains the data used to train the production class1 MHCflurry models. This data is derived from a recent [IEDB](http://www.iedb.org/home_v3.php) export, the data from [Kim 2014](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241), and a number of other sources.
 
 To generate this download run:
 
 ```
+pip install -r requirements.txt  # only needed the first time you generate this download
 ./GENERATE.sh
-```
\ No newline at end of file
+```
diff --git a/downloads-generation/data_curated/curate_by_pmid.py b/downloads-generation/data_curated/curate_by_pmid.py
index 4d478b9de14402f0cd93a9a4bf92af5e2eeeecdb..5d62f62b0a6cbad1bca0cefb1eadd0d286c15c2e 100755
--- a/downloads-generation/data_curated/curate_by_pmid.py
+++ b/downloads-generation/data_curated/curate_by_pmid.py
@@ -229,7 +229,7 @@ def handle_pmid_26992070(*filenames):
         for num in ["1", "2"]:
             allele_info[
                 "HLA-%s %s" % (gene, num)
-            ] = "HLA-" + gene + allele_info["HLA-%s %s" % (gene, num)]
+            ] = "HLA-" + gene + "*" + allele_info["HLA-%s %s" % (gene, num)]
     cell_line_to_allele = allele_info.apply(" ".join, axis=1)
 
     sheets = {}
@@ -585,7 +585,7 @@ def handle_pmid_31495665(filename):
         "MAPTAC_DRB3*01:01_dm-": "mixed",
     }
 
-    df = pandas.read_excel(filename, sheetname="DataS1B")
+    df = pandas.read_excel(filename, sheet_name="DataS1B")
     results = []
     for sample_id in df.columns:
         if hla_type[sample_id] is None: