diff --git a/downloads-generation/data_references/GENERATE.sh b/downloads-generation/data_references/GENERATE.sh index 00575b083bc4518094a5aaa65f453ecb65ce9746..9ac3a3c6bd1f400b3ec594b7ecbb59cad1c22e47 100755 --- a/downloads-generation/data_references/GENERATE.sh +++ b/downloads-generation/data_references/GENERATE.sh @@ -34,11 +34,14 @@ wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebas wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.fasta.gz wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.miss.gz wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_additional.fasta.gz - +wget -q ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz python process.py \ UP000005640_9606.fasta.gz UP000005640_9606_additional.fasta.gz \ - --out-csv uniprot_proteins.csv --out-index uniprot_proteins.fm + --id-mapping UP000005640_9606.idmapping.gz \ + --ensembl-gtf Homo_sapiens.GRCh38.98.gtf.gz \ + --out-csv uniprot_proteins.csv \ + --out-index uniprot_proteins.fm ls -lh uniprot_proteins.csv uniprot_proteins.fm diff --git a/downloads-generation/data_references/process.py b/downloads-generation/data_references/process.py index 8d03464e7366b76308fd4adb590b8afa2c3dde64..92ac090b70d6180605130e36c3b369f5c67eb693 100755 --- a/downloads-generation/data_references/process.py +++ b/downloads-generation/data_references/process.py @@ -6,6 +6,8 @@ import os import gzip import pandas + +import gtfparse import shellinford from Bio import SeqIO @@ -25,7 +27,16 @@ parser.add_argument( required=True, metavar="FILE.fm", help="Index output") - +parser.add_argument( + "--id-mapping", + required=True, + metavar="FILE.idmapping.gz", + help="Uniprot mapping file") +parser.add_argument( + "--ensembl-gtf", + required=True, + metavar="FILE.gtf.gz", + help="Ensembl GTF file") def run(): args = parser.parse_args(sys.argv[1:]) @@ -45,14 +56,54 @@ def run(): print("Done reading fastas") print(df) + pieces = df.name.str.split("|") + df["db"] = pieces.str.get(0) + df["accession"] = pieces.str.get(1) + df["entry"] = pieces.str.get(2) + + print("Annotating using mapping", args.id_mapping) + mapping_df = pandas.read_csv( + args.id_mapping, sep="\t", header=None) + mapping_df.columns = ['accession', 'key', 'value'] + + for item in ["Ensembl", "Ensembl_TRS", "Gene_Name"]: + accession_to_values = mapping_df.loc[ + mapping_df.key == item + ].groupby("accession").value.unique().map(" ".join) + df[item.lower()] = df.accession.map(accession_to_values) + + print("Annotating using gtf", args.ensembl_gtf) + gtf_df = gtfparse.read_gtf(args.ensembl_gtf) + matching_ensembl_genes = set(gtf_df.gene_id.unique()) + ensembl_primary = [] + for ensembls in df.ensembl.fillna("").str.split(): + result = "" + for item in ensembls: + if item in matching_ensembl_genes: + result = item + break + ensembl_primary.append(result) + df["ensembl_primary"] = ensembl_primary + print("Fraction of records with matching ensembl genes", ( + df.ensembl_primary != "").mean()) + + gene_records = gtf_df.loc[gtf_df.feature == "gene"].set_index("gene_id") + df["primary_ensembl_contig"] = df.ensembl_primary.map(gene_records.seqname) + df["primary_ensembl_start"] = df.ensembl_primary.map(gene_records.start) + df["primary_ensembl_end"] = df.ensembl_primary.map(gene_records.end) + df["primary_ensembl_strand"] = df.ensembl_primary.map(gene_records.strand) + + print("Done annotating") + print(df) + + df.to_csv(args.out_csv, index=True) + print("Wrote: ", os.path.abspath((args.out_csv))) + print("Building index") fm.build() fm.write(args.out_index) print("Wrote: ", os.path.abspath((args.out_index))) - df.to_csv(args.out_csv, index=True) - print("Wrote: ", os.path.abspath((args.out_csv))) - if __name__ == '__main__': run() diff --git a/downloads-generation/data_references/requirements.txt b/downloads-generation/data_references/requirements.txt index fd4acf48d1d83f7d55e6c4cd20bfc5b3ccdb6449..fd49a3344d43a09fc16e4c2cbfbbaeb024e3c239 100644 --- a/downloads-generation/data_references/requirements.txt +++ b/downloads-generation/data_references/requirements.txt @@ -1,2 +1,4 @@ shellinford biopython +gtfparse +