Skip to content
Snippets Groups Projects
Commit d0a9e30c authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

update

parent 9022729d
No related branches found
No related tags found
No related merge requests found
......@@ -34,11 +34,14 @@ wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebas
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.fasta.gz
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.miss.gz
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_additional.fasta.gz
wget -q ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz
python process.py \
UP000005640_9606.fasta.gz UP000005640_9606_additional.fasta.gz \
--out-csv uniprot_proteins.csv --out-index uniprot_proteins.fm
--id-mapping UP000005640_9606.idmapping.gz \
--ensembl-gtf Homo_sapiens.GRCh38.98.gtf.gz \
--out-csv uniprot_proteins.csv \
--out-index uniprot_proteins.fm
ls -lh uniprot_proteins.csv uniprot_proteins.fm
......
......@@ -6,6 +6,8 @@ import os
import gzip
import pandas
import gtfparse
import shellinford
from Bio import SeqIO
......@@ -25,7 +27,16 @@ parser.add_argument(
required=True,
metavar="FILE.fm",
help="Index output")
parser.add_argument(
"--id-mapping",
required=True,
metavar="FILE.idmapping.gz",
help="Uniprot mapping file")
parser.add_argument(
"--ensembl-gtf",
required=True,
metavar="FILE.gtf.gz",
help="Ensembl GTF file")
def run():
args = parser.parse_args(sys.argv[1:])
......@@ -45,14 +56,54 @@ def run():
print("Done reading fastas")
print(df)
pieces = df.name.str.split("|")
df["db"] = pieces.str.get(0)
df["accession"] = pieces.str.get(1)
df["entry"] = pieces.str.get(2)
print("Annotating using mapping", args.id_mapping)
mapping_df = pandas.read_csv(
args.id_mapping, sep="\t", header=None)
mapping_df.columns = ['accession', 'key', 'value']
for item in ["Ensembl", "Ensembl_TRS", "Gene_Name"]:
accession_to_values = mapping_df.loc[
mapping_df.key == item
].groupby("accession").value.unique().map(" ".join)
df[item.lower()] = df.accession.map(accession_to_values)
print("Annotating using gtf", args.ensembl_gtf)
gtf_df = gtfparse.read_gtf(args.ensembl_gtf)
matching_ensembl_genes = set(gtf_df.gene_id.unique())
ensembl_primary = []
for ensembls in df.ensembl.fillna("").str.split():
result = ""
for item in ensembls:
if item in matching_ensembl_genes:
result = item
break
ensembl_primary.append(result)
df["ensembl_primary"] = ensembl_primary
print("Fraction of records with matching ensembl genes", (
df.ensembl_primary != "").mean())
gene_records = gtf_df.loc[gtf_df.feature == "gene"].set_index("gene_id")
df["primary_ensembl_contig"] = df.ensembl_primary.map(gene_records.seqname)
df["primary_ensembl_start"] = df.ensembl_primary.map(gene_records.start)
df["primary_ensembl_end"] = df.ensembl_primary.map(gene_records.end)
df["primary_ensembl_strand"] = df.ensembl_primary.map(gene_records.strand)
print("Done annotating")
print(df)
df.to_csv(args.out_csv, index=True)
print("Wrote: ", os.path.abspath((args.out_csv)))
print("Building index")
fm.build()
fm.write(args.out_index)
print("Wrote: ", os.path.abspath((args.out_index)))
df.to_csv(args.out_csv, index=True)
print("Wrote: ", os.path.abspath((args.out_csv)))
if __name__ == '__main__':
run()
shellinford
biopython
gtfparse
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment