Skip to content
Snippets Groups Projects
Commit 782a36ef authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

update

parent 9192933a
No related merge requests found
......@@ -34,11 +34,14 @@ wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebas
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.fasta.gz
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.miss.gz
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_additional.fasta.gz
wget -q ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz
python process.py \
UP000005640_9606.fasta.gz UP000005640_9606_additional.fasta.gz \
--out-csv uniprot_proteins.csv --out-index uniprot_proteins.fm
--id-mapping UP000005640_9606.idmapping.gz \
--ensembl-gtf Homo_sapiens.GRCh38.98.gtf.gz \
--out-csv uniprot_proteins.csv \
--out-index uniprot_proteins.fm
ls -lh uniprot_proteins.csv uniprot_proteins.fm
......
......@@ -6,6 +6,8 @@ import os
import gzip
import pandas
import gtfparse
import shellinford
from Bio import SeqIO
......@@ -25,7 +27,16 @@ parser.add_argument(
required=True,
metavar="FILE.fm",
help="Index output")
parser.add_argument(
"--id-mapping",
required=True,
metavar="FILE.idmapping.gz",
help="Uniprot mapping file")
parser.add_argument(
"--ensembl-gtf",
required=True,
metavar="FILE.gtf.gz",
help="Ensembl GTF file")
def run():
args = parser.parse_args(sys.argv[1:])
......@@ -45,14 +56,54 @@ def run():
print("Done reading fastas")
print(df)
pieces = df.name.str.split("|")
df["db"] = pieces.str.get(0)
df["accession"] = pieces.str.get(1)
df["entry"] = pieces.str.get(2)
print("Annotating using mapping", args.id_mapping)
mapping_df = pandas.read_csv(
args.id_mapping, sep="\t", header=None)
mapping_df.columns = ['accession', 'key', 'value']
for item in ["Ensembl", "Ensembl_TRS", "Gene_Name"]:
accession_to_values = mapping_df.loc[
mapping_df.key == item
].groupby("accession").value.unique().map(" ".join)
df[item.lower()] = df.accession.map(accession_to_values)
print("Annotating using gtf", args.ensembl_gtf)
gtf_df = gtfparse.read_gtf(args.ensembl_gtf)
matching_ensembl_genes = set(gtf_df.gene_id.unique())
ensembl_primary = []
for ensembls in df.ensembl.fillna("").str.split():
result = ""
for item in ensembls:
if item in matching_ensembl_genes:
result = item
break
ensembl_primary.append(result)
df["ensembl_primary"] = ensembl_primary
print("Fraction of records with matching ensembl genes", (
df.ensembl_primary != "").mean())
gene_records = gtf_df.loc[gtf_df.feature == "gene"].set_index("gene_id")
df["primary_ensembl_contig"] = df.ensembl_primary.map(gene_records.seqname)
df["primary_ensembl_start"] = df.ensembl_primary.map(gene_records.start)
df["primary_ensembl_end"] = df.ensembl_primary.map(gene_records.end)
df["primary_ensembl_strand"] = df.ensembl_primary.map(gene_records.strand)
print("Done annotating")
print(df)
df.to_csv(args.out_csv, index=True)
print("Wrote: ", os.path.abspath((args.out_csv)))
print("Building index")
fm.build()
fm.write(args.out_index)
print("Wrote: ", os.path.abspath((args.out_index)))
df.to_csv(args.out_csv, index=True)
print("Wrote: ", os.path.abspath((args.out_csv)))
if __name__ == '__main__':
run()
shellinford
biopython
gtfparse
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment