From 782a36ef8a7557067dfaafa4ff581f61f168a248 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Fri, 27 Sep 2019 16:03:11 -0400
Subject: [PATCH] update

---
 .../data_references/GENERATE.sh               |  7 ++-
 .../data_references/process.py                | 59 +++++++++++++++++--
 .../data_references/requirements.txt          |  2 +
 3 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/downloads-generation/data_references/GENERATE.sh b/downloads-generation/data_references/GENERATE.sh
index 00575b08..9ac3a3c6 100755
--- a/downloads-generation/data_references/GENERATE.sh
+++ b/downloads-generation/data_references/GENERATE.sh
@@ -34,11 +34,14 @@ wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebas
 wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.fasta.gz
 wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.miss.gz
 wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_additional.fasta.gz
-
+wget -q ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz
 
 python process.py \
     UP000005640_9606.fasta.gz UP000005640_9606_additional.fasta.gz \
-    --out-csv uniprot_proteins.csv --out-index uniprot_proteins.fm
+    --id-mapping UP000005640_9606.idmapping.gz \
+    --ensembl-gtf Homo_sapiens.GRCh38.98.gtf.gz \
+    --out-csv uniprot_proteins.csv \
+    --out-index uniprot_proteins.fm
 
 ls -lh uniprot_proteins.csv uniprot_proteins.fm
 
diff --git a/downloads-generation/data_references/process.py b/downloads-generation/data_references/process.py
index 8d03464e..92ac090b 100755
--- a/downloads-generation/data_references/process.py
+++ b/downloads-generation/data_references/process.py
@@ -6,6 +6,8 @@ import os
 import gzip
 
 import pandas
+
+import gtfparse
 import shellinford
 from Bio import SeqIO
 
@@ -25,7 +27,16 @@ parser.add_argument(
     required=True,
     metavar="FILE.fm",
     help="Index output")
-
+parser.add_argument(
+    "--id-mapping",
+    required=True,
+    metavar="FILE.idmapping.gz",
+    help="Uniprot mapping file")
+parser.add_argument(
+    "--ensembl-gtf",
+    required=True,
+    metavar="FILE.gtf.gz",
+    help="Ensembl GTF file")
 
 def run():
     args = parser.parse_args(sys.argv[1:])
@@ -45,14 +56,54 @@ def run():
     print("Done reading fastas")
     print(df)
 
+    pieces = df.name.str.split("|")
+    df["db"] = pieces.str.get(0)
+    df["accession"] = pieces.str.get(1)
+    df["entry"] = pieces.str.get(2)
+
+    print("Annotating using mapping", args.id_mapping)
+    mapping_df = pandas.read_csv(
+        args.id_mapping, sep="\t", header=None)
+    mapping_df.columns = ['accession', 'key', 'value']
+
+    for item in ["Ensembl", "Ensembl_TRS", "Gene_Name"]:
+        accession_to_values = mapping_df.loc[
+            mapping_df.key == item
+        ].groupby("accession").value.unique().map(" ".join)
+        df[item.lower()] = df.accession.map(accession_to_values)
+
+    print("Annotating using gtf", args.ensembl_gtf)
+    gtf_df = gtfparse.read_gtf(args.ensembl_gtf)
+    matching_ensembl_genes = set(gtf_df.gene_id.unique())
+    ensembl_primary = []
+    for ensembls in df.ensembl.fillna("").str.split():
+        result = ""
+        for item in ensembls:
+            if item in matching_ensembl_genes:
+                result = item
+                break
+        ensembl_primary.append(result)
+    df["ensembl_primary"] = ensembl_primary
+    print("Fraction of records with matching ensembl genes", (
+            df.ensembl_primary != "").mean())
+
+    gene_records = gtf_df.loc[gtf_df.feature == "gene"].set_index("gene_id")
+    df["primary_ensembl_contig"] = df.ensembl_primary.map(gene_records.seqname)
+    df["primary_ensembl_start"] = df.ensembl_primary.map(gene_records.start)
+    df["primary_ensembl_end"] = df.ensembl_primary.map(gene_records.end)
+    df["primary_ensembl_strand"] = df.ensembl_primary.map(gene_records.strand)
+
+    print("Done annotating")
+    print(df)
+
+    df.to_csv(args.out_csv, index=True)
+    print("Wrote: ", os.path.abspath((args.out_csv)))
+
     print("Building index")
     fm.build()
     fm.write(args.out_index)
     print("Wrote: ", os.path.abspath((args.out_index)))
 
-    df.to_csv(args.out_csv, index=True)
-    print("Wrote: ", os.path.abspath((args.out_csv)))
-
 
 if __name__ == '__main__':
     run()
diff --git a/downloads-generation/data_references/requirements.txt b/downloads-generation/data_references/requirements.txt
index fd4acf48..fd49a334 100644
--- a/downloads-generation/data_references/requirements.txt
+++ b/downloads-generation/data_references/requirements.txt
@@ -1,2 +1,4 @@
 shellinford
 biopython
+gtfparse
+
-- 
GitLab