From 782a36ef8a7557067dfaafa4ff581f61f168a248 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Fri, 27 Sep 2019 16:03:11 -0400 Subject: [PATCH] update --- .../data_references/GENERATE.sh | 7 ++- .../data_references/process.py | 59 +++++++++++++++++-- .../data_references/requirements.txt | 2 + 3 files changed, 62 insertions(+), 6 deletions(-) diff --git a/downloads-generation/data_references/GENERATE.sh b/downloads-generation/data_references/GENERATE.sh index 00575b08..9ac3a3c6 100755 --- a/downloads-generation/data_references/GENERATE.sh +++ b/downloads-generation/data_references/GENERATE.sh @@ -34,11 +34,14 @@ wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebas wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.fasta.gz wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.miss.gz wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_additional.fasta.gz - +wget -q ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz python process.py \ UP000005640_9606.fasta.gz UP000005640_9606_additional.fasta.gz \ - --out-csv uniprot_proteins.csv --out-index uniprot_proteins.fm + --id-mapping UP000005640_9606.idmapping.gz \ + --ensembl-gtf Homo_sapiens.GRCh38.98.gtf.gz \ + --out-csv uniprot_proteins.csv \ + --out-index uniprot_proteins.fm ls -lh uniprot_proteins.csv uniprot_proteins.fm diff --git a/downloads-generation/data_references/process.py b/downloads-generation/data_references/process.py index 8d03464e..92ac090b 100755 --- a/downloads-generation/data_references/process.py +++ b/downloads-generation/data_references/process.py @@ -6,6 +6,8 @@ import os import gzip import pandas + +import gtfparse import shellinford from Bio import SeqIO @@ -25,7 +27,16 @@ parser.add_argument( required=True, metavar="FILE.fm", help="Index output") - +parser.add_argument( + "--id-mapping", + required=True, + metavar="FILE.idmapping.gz", + help="Uniprot mapping file") +parser.add_argument( + "--ensembl-gtf", + required=True, + metavar="FILE.gtf.gz", + help="Ensembl GTF file") def run(): args = parser.parse_args(sys.argv[1:]) @@ -45,14 +56,54 @@ def run(): print("Done reading fastas") print(df) + pieces = df.name.str.split("|") + df["db"] = pieces.str.get(0) + df["accession"] = pieces.str.get(1) + df["entry"] = pieces.str.get(2) + + print("Annotating using mapping", args.id_mapping) + mapping_df = pandas.read_csv( + args.id_mapping, sep="\t", header=None) + mapping_df.columns = ['accession', 'key', 'value'] + + for item in ["Ensembl", "Ensembl_TRS", "Gene_Name"]: + accession_to_values = mapping_df.loc[ + mapping_df.key == item + ].groupby("accession").value.unique().map(" ".join) + df[item.lower()] = df.accession.map(accession_to_values) + + print("Annotating using gtf", args.ensembl_gtf) + gtf_df = gtfparse.read_gtf(args.ensembl_gtf) + matching_ensembl_genes = set(gtf_df.gene_id.unique()) + ensembl_primary = [] + for ensembls in df.ensembl.fillna("").str.split(): + result = "" + for item in ensembls: + if item in matching_ensembl_genes: + result = item + break + ensembl_primary.append(result) + df["ensembl_primary"] = ensembl_primary + print("Fraction of records with matching ensembl genes", ( + df.ensembl_primary != "").mean()) + + gene_records = gtf_df.loc[gtf_df.feature == "gene"].set_index("gene_id") + df["primary_ensembl_contig"] = df.ensembl_primary.map(gene_records.seqname) + df["primary_ensembl_start"] = df.ensembl_primary.map(gene_records.start) + df["primary_ensembl_end"] = df.ensembl_primary.map(gene_records.end) + df["primary_ensembl_strand"] = df.ensembl_primary.map(gene_records.strand) + + print("Done annotating") + print(df) + + df.to_csv(args.out_csv, index=True) + print("Wrote: ", os.path.abspath((args.out_csv))) + print("Building index") fm.build() fm.write(args.out_index) print("Wrote: ", os.path.abspath((args.out_index))) - df.to_csv(args.out_csv, index=True) - print("Wrote: ", os.path.abspath((args.out_csv))) - if __name__ == '__main__': run() diff --git a/downloads-generation/data_references/requirements.txt b/downloads-generation/data_references/requirements.txt index fd4acf48..fd49a334 100644 --- a/downloads-generation/data_references/requirements.txt +++ b/downloads-generation/data_references/requirements.txt @@ -1,2 +1,4 @@ shellinford biopython +gtfparse + -- GitLab