Skip to content
Snippets Groups Projects
Commit 782a36ef authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

update

parent 9192933a
No related branches found
No related tags found
No related merge requests found
...@@ -34,11 +34,14 @@ wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebas ...@@ -34,11 +34,14 @@ wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebas
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.fasta.gz wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.fasta.gz
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.miss.gz wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_DNA.miss.gz
wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_additional.fasta.gz wget -q ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/reference_proteomes/Eukaryota/UP000005640_9606_additional.fasta.gz
wget -q ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz
python process.py \ python process.py \
UP000005640_9606.fasta.gz UP000005640_9606_additional.fasta.gz \ UP000005640_9606.fasta.gz UP000005640_9606_additional.fasta.gz \
--out-csv uniprot_proteins.csv --out-index uniprot_proteins.fm --id-mapping UP000005640_9606.idmapping.gz \
--ensembl-gtf Homo_sapiens.GRCh38.98.gtf.gz \
--out-csv uniprot_proteins.csv \
--out-index uniprot_proteins.fm
ls -lh uniprot_proteins.csv uniprot_proteins.fm ls -lh uniprot_proteins.csv uniprot_proteins.fm
......
...@@ -6,6 +6,8 @@ import os ...@@ -6,6 +6,8 @@ import os
import gzip import gzip
import pandas import pandas
import gtfparse
import shellinford import shellinford
from Bio import SeqIO from Bio import SeqIO
...@@ -25,7 +27,16 @@ parser.add_argument( ...@@ -25,7 +27,16 @@ parser.add_argument(
required=True, required=True,
metavar="FILE.fm", metavar="FILE.fm",
help="Index output") help="Index output")
parser.add_argument(
"--id-mapping",
required=True,
metavar="FILE.idmapping.gz",
help="Uniprot mapping file")
parser.add_argument(
"--ensembl-gtf",
required=True,
metavar="FILE.gtf.gz",
help="Ensembl GTF file")
def run(): def run():
args = parser.parse_args(sys.argv[1:]) args = parser.parse_args(sys.argv[1:])
...@@ -45,14 +56,54 @@ def run(): ...@@ -45,14 +56,54 @@ def run():
print("Done reading fastas") print("Done reading fastas")
print(df) print(df)
pieces = df.name.str.split("|")
df["db"] = pieces.str.get(0)
df["accession"] = pieces.str.get(1)
df["entry"] = pieces.str.get(2)
print("Annotating using mapping", args.id_mapping)
mapping_df = pandas.read_csv(
args.id_mapping, sep="\t", header=None)
mapping_df.columns = ['accession', 'key', 'value']
for item in ["Ensembl", "Ensembl_TRS", "Gene_Name"]:
accession_to_values = mapping_df.loc[
mapping_df.key == item
].groupby("accession").value.unique().map(" ".join)
df[item.lower()] = df.accession.map(accession_to_values)
print("Annotating using gtf", args.ensembl_gtf)
gtf_df = gtfparse.read_gtf(args.ensembl_gtf)
matching_ensembl_genes = set(gtf_df.gene_id.unique())
ensembl_primary = []
for ensembls in df.ensembl.fillna("").str.split():
result = ""
for item in ensembls:
if item in matching_ensembl_genes:
result = item
break
ensembl_primary.append(result)
df["ensembl_primary"] = ensembl_primary
print("Fraction of records with matching ensembl genes", (
df.ensembl_primary != "").mean())
gene_records = gtf_df.loc[gtf_df.feature == "gene"].set_index("gene_id")
df["primary_ensembl_contig"] = df.ensembl_primary.map(gene_records.seqname)
df["primary_ensembl_start"] = df.ensembl_primary.map(gene_records.start)
df["primary_ensembl_end"] = df.ensembl_primary.map(gene_records.end)
df["primary_ensembl_strand"] = df.ensembl_primary.map(gene_records.strand)
print("Done annotating")
print(df)
df.to_csv(args.out_csv, index=True)
print("Wrote: ", os.path.abspath((args.out_csv)))
print("Building index") print("Building index")
fm.build() fm.build()
fm.write(args.out_index) fm.write(args.out_index)
print("Wrote: ", os.path.abspath((args.out_index))) print("Wrote: ", os.path.abspath((args.out_index)))
df.to_csv(args.out_csv, index=True)
print("Wrote: ", os.path.abspath((args.out_csv)))
if __name__ == '__main__': if __name__ == '__main__':
run() run()
shellinford shellinford
biopython biopython
gtfparse
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment