Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add UCSC TxDb #66

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
241 changes: 241 additions & 0 deletions genomicFeatures_test.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a699ddc4-502f-418e-9f26-99677ad07cff",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"The downloaded binary packages are in\n",
"\t/var/folders/zs/gjblv2b16g3b50jqcq6fw76m0000gq/T//RtmpDxAFv6/downloaded_packages\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"'getOption(\"repos\")' replaces Bioconductor standard repositories, see\n",
"'help(\"repositories\", package = \"BiocManager\")' for details.\n",
"Replacement repositories:\n",
" CRAN: https://cran.r-project.org\n",
"\n",
"Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.3 (2024-02-29)\n",
"\n",
"Installing package(s) 'BiocVersion'\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"The downloaded binary packages are in\n",
"\t/var/folders/zs/gjblv2b16g3b50jqcq6fw76m0000gq/T//RtmpDxAFv6/downloaded_packages\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Old packages: 'boot', 'codetools', 'lattice'\n",
"\n"
]
}
],
"source": [
"if (!require(\"BiocManager\", quietly = TRUE))\n",
" install.packages(\"BiocManager\")\n",
"BiocManager::install(version = \"3.18\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "a54fc80d-569d-409d-8163-1ff4215a6a7c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"'getOption(\"repos\")' replaces Bioconductor standard repositories, see\n",
"'help(\"repositories\", package = \"BiocManager\")' for details.\n",
"Replacement repositories:\n",
" CRAN: https://cran.r-project.org\n",
"\n",
"Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.3 (2024-02-29)\n",
"\n",
"Installing package(s) 'TxDb.Hsapiens.UCSC.hg38.knownGene'\n",
"\n",
"installing the source package ‘TxDb.Hsapiens.UCSC.hg38.knownGene’\n",
"\n",
"\n",
"Old packages: 'boot', 'codetools', 'lattice'\n",
"\n"
]
}
],
"source": [
"# BiocManager::install(\"GenomicFeatures\")\n",
"BiocManager::install(\"TxDb.Hsapiens.UCSC.hg38.knownGene\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9f2ca2f7-6c28-4a70-ace8-3015e7fbf0b1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TxDb object:\n",
"# Db type: TxDb\n",
"# Supporting package: GenomicFeatures\n",
"# Data source: UCSC\n",
"# Genome: hg38\n",
"# Organism: Homo sapiens\n",
"# Taxonomy ID: 9606\n",
"# UCSC Table: knownGene\n",
"# UCSC Track: GENCODE V44\n",
"# Resource URL: http://genome.ucsc.edu/\n",
"# Type of Gene ID: Entrez Gene ID\n",
"# Full dataset: yes\n",
"# miRBase build ID: NA\n",
"# Nb of transcripts: 276905\n",
"# Db created by: GenomicFeatures package from Bioconductor\n",
"# Creation time: 2023-09-20 17:25:17 +0000 (Wed, 20 Sep 2023)\n",
"# GenomicFeatures version at creation time: 1.53.2\n",
"# RSQLite version at creation time: 2.3.1\n",
"# DBSCHEMAVERSION: 1.2"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"library(TxDb.Hsapiens.UCSC.hg38.knownGene)\n",
"txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene\n",
"txdb"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "2ded4ebf-2390-4261-93ee-464bcb7d58db",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 2135 genes were dropped because they have exons located on both strands\n",
" of the same reference sequence or on more than one reference sequence,\n",
" so cannot be represented by a single genomic range.\n",
" Use 'single.strand.genes.only=FALSE' to get all the genes in a\n",
" GRangesList object, or use suppressMessages() to suppress this message.\n",
"\n"
]
},
{
"data": {
"text/plain": [
"GRanges object with 30733 ranges and 1 metadata column:\n",
" seqnames ranges strand | gene_id\n",
" <Rle> <IRanges> <Rle> | <character>\n",
" 1 chr19 58345178-58362751 - | 1\n",
" 10 chr8 18386311-18401218 + | 10\n",
" 100 chr20 44584896-44652252 - | 100\n",
" 1000 chr18 27932879-28177946 - | 1000\n",
" 100008586 chrX 49551278-49568218 + | 100008586\n",
" ... ... ... ... . ...\n",
" 9990 chr15 34229784-34338060 - | 9990\n",
" 9991 chr9 112217716-112333664 - | 9991\n",
" 9992 chr21 34364006-34371381 + | 9992\n",
" 9993 chr22 19036282-19122454 - | 9993\n",
" 9997 chr22 50523568-50526461 - | 9997\n",
" -------\n",
" seqinfo: 711 sequences (1 circular) from hg38 genome"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"genes(txdb)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "41f123b2-6f97-4f33-b54f-46c07603a432",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 2135 genes were dropped because they have exons located on both strands\n",
" of the same reference sequence or on more than one reference sequence,\n",
" so cannot be represented by a single genomic range.\n",
" Use 'single.strand.genes.only=FALSE' to get all the genes in a\n",
" GRangesList object, or use suppressMessages() to suppress this message.\n",
"\n"
]
},
{
"data": {
"text/plain": [
"GRanges object with 30733 ranges and 1 metadata column:\n",
" seqnames ranges strand | gene_id\n",
" <Rle> <IRanges> <Rle> | <character>\n",
" 1 chr19 58345178-58362751 - | 1\n",
" 10 chr8 18386311-18401218 + | 10\n",
" 100 chr20 44584896-44652252 - | 100\n",
" 1000 chr18 27932879-28177946 - | 1000\n",
" 100008586 chrX 49551278-49568218 + | 100008586\n",
" ... ... ... ... . ...\n",
" 9990 chr15 34229784-34338060 - | 9990\n",
" 9991 chr9 112217716-112333664 - | 9991\n",
" 9992 chr21 34364006-34371381 + | 9992\n",
" 9993 chr22 19036282-19122454 - | 9993\n",
" 9997 chr22 50523568-50526461 - | 9997\n",
" -------\n",
" seqinfo: 711 sequences (1 circular) from hg38 genome"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"genes(txdb)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "R",
"language": "R",
"name": "ir"
},
"language_info": {
"codemirror_mode": "r",
"file_extension": ".r",
"mimetype": "text/x-r-source",
"name": "R",
"pygments_lexer": "r",
"version": "4.3.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
4 changes: 2 additions & 2 deletions src/genomic_features/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from importlib.metadata import version

from . import ensembl, filters
from . import ensembl, filters, ucsc

__all__ = ["ensembl"]
__all__ = ["ensembl", "ucsc"]

__version__ = version("genomic-features")
1 change: 1 addition & 0 deletions src/genomic_features/ucsc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .ucscdb import UCSCDB, annotation, list_ucscdb_annotations
Loading
Loading