diff --git a/downloads-generation/data_references/GENERATE.sh b/downloads-generation/data_references/GENERATE.sh
index 00575b083bc4518094a5aaa65f453ecb65ce9746..9ac3a3c6bd1f400b3ec594b7ecbb59cad1c22e47 100755
--- a/downloads-generation/data_references/GENERATE.sh
+++ b/downloads-generation/data_references/GENERATE.sh
@@ -34,11 +34,14 @@ wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebas
 wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.fasta.gz
 wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.miss.gz
 wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_additional.fasta.gz
-
+wget -q ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz
 
 python process.py \
     UP000005640_9606.fasta.gz UP000005640_9606_additional.fasta.gz \
-    --out-csv uniprot_proteins.csv --out-index uniprot_proteins.fm
+    --id-mapping UP000005640_9606.idmapping.gz \
+    --ensembl-gtf Homo_sapiens.GRCh38.98.gtf.gz \
+    --out-csv uniprot_proteins.csv \
+    --out-index uniprot_proteins.fm
 
 ls -lh uniprot_proteins.csv uniprot_proteins.fm
 
diff --git a/downloads-generation/data_references/process.py b/downloads-generation/data_references/process.py
index 8d03464e7366b76308fd4adb590b8afa2c3dde64..92ac090b70d6180605130e36c3b369f5c67eb693 100755
--- a/downloads-generation/data_references/process.py
+++ b/downloads-generation/data_references/process.py
@@ -6,6 +6,8 @@ import os
 import gzip
 
 import pandas
+
+import gtfparse
 import shellinford
 from Bio import SeqIO
 
@@ -25,7 +27,16 @@ parser.add_argument(
     required=True,
     metavar="FILE.fm",
     help="Index output")
-
+parser.add_argument(
+    "--id-mapping",
+    required=True,
+    metavar="FILE.idmapping.gz",
+    help="Uniprot mapping file")
+parser.add_argument(
+    "--ensembl-gtf",
+    required=True,
+    metavar="FILE.gtf.gz",
+    help="Ensembl GTF file")
 
 def run():
     args = parser.parse_args(sys.argv[1:])
@@ -45,14 +56,54 @@ def run():
     print("Done reading fastas")
     print(df)
 
+    pieces = df.name.str.split("|")
+    df["db"] = pieces.str.get(0)
+    df["accession"] = pieces.str.get(1)
+    df["entry"] = pieces.str.get(2)
+
+    print("Annotating using mapping", args.id_mapping)
+    mapping_df = pandas.read_csv(
+        args.id_mapping, sep="\t", header=None)
+    mapping_df.columns = ['accession', 'key', 'value']
+
+    for item in ["Ensembl", "Ensembl_TRS", "Gene_Name"]:
+        accession_to_values = mapping_df.loc[
+            mapping_df.key == item
+        ].groupby("accession").value.unique().map(" ".join)
+        df[item.lower()] = df.accession.map(accession_to_values)
+
+    print("Annotating using gtf", args.ensembl_gtf)
+    gtf_df = gtfparse.read_gtf(args.ensembl_gtf)
+    matching_ensembl_genes = set(gtf_df.gene_id.unique())
+    ensembl_primary = []
+    for ensembls in df.ensembl.fillna("").str.split():
+        result = ""
+        for item in ensembls:
+            if item in matching_ensembl_genes:
+                result = item
+                break
+        ensembl_primary.append(result)
+    df["ensembl_primary"] = ensembl_primary
+    print("Fraction of records with matching ensembl genes", (
+            df.ensembl_primary != "").mean())
+
+    gene_records = gtf_df.loc[gtf_df.feature == "gene"].set_index("gene_id")
+    df["primary_ensembl_contig"] = df.ensembl_primary.map(gene_records.seqname)
+    df["primary_ensembl_start"] = df.ensembl_primary.map(gene_records.start)
+    df["primary_ensembl_end"] = df.ensembl_primary.map(gene_records.end)
+    df["primary_ensembl_strand"] = df.ensembl_primary.map(gene_records.strand)
+
+    print("Done annotating")
+    print(df)
+
+    df.to_csv(args.out_csv, index=True)
+    print("Wrote: ", os.path.abspath((args.out_csv)))
+
     print("Building index")
     fm.build()
     fm.write(args.out_index)
     print("Wrote: ", os.path.abspath((args.out_index)))
 
-    df.to_csv(args.out_csv, index=True)
-    print("Wrote: ", os.path.abspath((args.out_csv)))
-
 
 if __name__ == '__main__':
     run()
diff --git a/downloads-generation/data_references/requirements.txt b/downloads-generation/data_references/requirements.txt
index fd4acf48d1d83f7d55e6c4cd20bfc5b3ccdb6449..fd49a3344d43a09fc16e4c2cbfbbaeb024e3c239 100644
--- a/downloads-generation/data_references/requirements.txt
+++ b/downloads-generation/data_references/requirements.txt
@@ -1,2 +1,4 @@
 shellinford
 biopython
+gtfparse
+