From ca0c28f3b1bd654b95343f7ba580e40a8386af6f Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Wed, 3 Apr 2024 17:54:09 -0400 Subject: [PATCH 1/4] initial UCSC commit --- src/genomic_features/__init__.py | 4 +- src/genomic_features/ucsc/__init__.py | 1 + src/genomic_features/ucsc/ucscdb.py | 373 ++++++++++++++++++++++++ ucscdb.ipynb | 394 ++++++++++++++++++++++++++ 4 files changed, 770 insertions(+), 2 deletions(-) create mode 100644 src/genomic_features/ucsc/__init__.py create mode 100644 src/genomic_features/ucsc/ucscdb.py create mode 100644 ucscdb.ipynb diff --git a/src/genomic_features/__init__.py b/src/genomic_features/__init__.py index 679dc43..b5da4de 100644 --- a/src/genomic_features/__init__.py +++ b/src/genomic_features/__init__.py @@ -1,7 +1,7 @@ from importlib.metadata import version -from . import ensembl, filters +from . import ensembl, filters, ucsc -__all__ = ["ensembl"] +__all__ = ["ensembl", "ucsc"] __version__ = version("genomic-features") diff --git a/src/genomic_features/ucsc/__init__.py b/src/genomic_features/ucsc/__init__.py new file mode 100644 index 0000000..a363549 --- /dev/null +++ b/src/genomic_features/ucsc/__init__.py @@ -0,0 +1 @@ +from .ucscdb import UCSCDB, annotation, list_ucscdb_annotations diff --git a/src/genomic_features/ucsc/ucscdb.py b/src/genomic_features/ucsc/ucscdb.py new file mode 100644 index 0000000..27a0d5d --- /dev/null +++ b/src/genomic_features/ucsc/ucscdb.py @@ -0,0 +1,373 @@ +from __future__ import annotations + +import warnings +from functools import cached_property +from itertools import product +import os +from pathlib import Path +from typing import Final, Literal + +import ibis +import requests +from ibis import deferred +from ibis.expr.types import Table as IbisTable +from pandas import DataFrame, Timestamp +from requests.exceptions import HTTPError + +from genomic_features import filters +from genomic_features._core import filters as _filters +from genomic_features._core.cache import retrieve_annotation + +PKG_CACHE_DIR = "genomic-features" + +BIOC_ANNOTATION_HUB_URL = ( + "https://bioconductorhubs.blob.core.windows.net/annotationhub/" +) +ANNOTATION_HUB_URL = ( + "https://annotationhub.bioconductor.org/metadata/annotationhub.sqlite3" +) +TIMESTAMP_URL = "https://annotationhub.bioconductor.org/metadata/database_timestamp" + + +def annotation(species: str, bioc_version: str, assembly: str, + ucsc_table: str) -> UCSCDB: + try: + ucscdb = UCSCDB( + ibis.sqlite.connect( + retrieve_annotation(os.path.join( + BIOC_ANNOTATION_HUB_URL, + f"ucsc/standard/{bioc_version}/TxDb.{species}.UCSC.{assembly}.{ucsc_table}.sqlite" + )) + ) + ) + except HTTPError as err: + if err.response.status_code == 404: + raise ValueError( + f"No ucsc TxDb database found for {species} {bioc_version} {assembly} {ucsc_table}. Check available versions with `genomic_features.ucsc.list_ucscdb_annotation`." + ) from err + else: + raise HTTPError from err + return ucscdb + + +def list_ucscdb_annotations(species: None | str | list[str] = None) -> DataFrame: + """List available Ensembl gene annotations. + + Parameters + ---------- + species + Show gene annotations for subset of species E.g. Hsapiens for human, Mmusculus + for mouse (optional) + + Returns + ------- + A table of available species and annotation versions in EnsDb. + + + Usage + ----- + >>> gf.ensembl.list_ensdb_annotations("Mmusculus") + """ + _COL_ORDERS = ['species', 'assembly', 'ucsc_table', 'bioc_version'] + # Get latest AnnotationHub timestamp + db_path = Path(retrieve_annotation(ANNOTATION_HUB_URL)) + timestamp = requests.get(TIMESTAMP_URL).text + ahdb = ibis.sqlite.connect(db_path) + latest_ts = Timestamp(timestamp).replace(tzinfo=None) + cached_ts = ahdb.table("timestamp").execute()["timestamp"][0] + if latest_ts != cached_ts: + db_path.unlink() + ahdb = ibis.sqlite.connect(retrieve_annotation(ANNOTATION_HUB_URL)) + + version_table = ( + ahdb.table("rdatapaths").filter(deferred.rdataclass == "TxDb").execute() + ) + version_table = version_table[version_table['rdatapath'].map(lambda x: x.split('/')[0] == 'ucsc')] + + version_table["bioc_version"] = ( + version_table["rdatapath"] + .str.split("/", expand=True)[2] + ) + version_table["species"] = ( + version_table["rdatapath"] + .str.split("/", expand=True)[3] + .str.split(".", expand=True)[1] + ) + version_table["assembly"] = ( + version_table["rdatapath"] + .str.split("/", expand=True)[3] + .str.split(".", expand=True)[3] + ) + version_table["ucsc_table"] = ( + version_table["rdatapath"] + .str.split("/", expand=True)[3] + .str.split(".", expand=True)[4] + ) + # `Athaliana` do not follow the normal name formatting, drop them. + version_table = version_table[version_table['ucsc_table'] != 'sqlite'] + + if species is not None: + if isinstance(species, str): + version_table = version_table[version_table["species"] == species] + else: + version_table = version_table[version_table["species"].isin(species)] + # check that species exist + if version_table.shape[0] == 0: + raise ValueError( + f'No ucsc database found for {species}. Must be in {" ".join(df["species"].unique())}.' + ) + + return version_table[_COL_ORDERS].sort_values(_COL_ORDERS) + + +class UCSCDB: + """UCSC annotation database.""" + + def __init__(self, connection: ibis.BaseBackend): + self.db = connection + + @cached_property + def metadata(self) -> dict: + metadata_tbl = self.db.table("metadata").execute() + return dict(zip(metadata_tbl["name"], metadata_tbl["value"])) + + def __repr__(self) -> str: + d = self.metadata + return f"UCSCDB(organism='{d['Organism']}', ucsc_track='{d['UCSC Track']}', genome='{d['Genome']}', ucsc_table='{d['UCSC Table']}')" + + # TODO(gamazeps): should we add some info on that ? UCSC just has tx_id + def genes( + self, + cols: list[str] | None = None, + filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), + join_type: Literal["inner", "left"] = "inner", + ) -> DataFrame: + table: Final = "gene" + if cols is None: + cols = self.list_columns(table) # get all columns + + cols = cols.copy() + if "gene_id" not in cols: # genes always needs gene_id + cols.append("gene_id") + + query = self._build_query(table, cols, filter, join_type) + return self._execute_query(query) + + def transcripts( + self, + cols: list[str] | None = None, + filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), + join_type: Literal["inner", "left"] = "inner", + ) -> DataFrame: + table: Final = "transcript" + if cols is None: + cols = self.list_columns(table) # get all columns + + cols = cols.copy() + # Require primary key in output + if "_tx_id" not in cols: + cols.append("_tx_id") + # seq_name is required for genomic range operations + if ("tx_start" in cols or "tx_end" in cols) and "tx_chrome" not in cols: + cols.append("tx_chrom") + + query = self._build_query(table, cols, filter, join_type) + return self._execute_query(query) + + def exons( + self, + cols: list[str] | None = None, + filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), + join_type: Literal["inner", "left"] = "inner", + ) -> DataFrame: + table: Final = "exon" + if cols is None: + cols = self.list_columns(table) # get all columns + + cols = cols.copy() + # Require primary key in output + if "_exon_id" not in cols: + cols.append("_exon_id") + # seq_name is required for genomic range operations + if ( + "exon_start" in cols or "exon_end" in cols + ) and "exon_chrom" not in cols: + cols.append("exon_chrom") + + query = self._build_query(table, cols, filter, join_type) + return self._execute_query(query) + + def _execute_query(self, query: IbisTable) -> DataFrame: + # TODO: Allow more options for returning results + return query.distinct().execute() + + def chrominfo(self) -> DataFrame: + return self.db.table("chrominfo").execute() + + def list_tables(self) -> list: + return self.db.list_tables() + + def _tables_by_degree(self, tab: list[str] = None) -> list: + if tab is None: + tab = self.list_tables() # list of table names + # check that all tables are in the database and print warning + if not set(tab).issubset(set(self.list_tables())): + missing_tables = ", ".join(set(tab) - set(self.list_tables())) + warnings.warn( + f"The following tables are not in the database: {missing_tables}.", + UserWarning, + stacklevel=2, + ) + + tab = list(set(tab) & set(self.list_tables())) # remove tables not in db + + # order tables + + table_order = { + "transcript": 1, + "cds": 2, + "gene": 3, + "splicing": 4, + "exon": 5, + "chrominfo": 6, + "metadata": 99, + } + + return sorted(tab, key=lambda x: table_order[x]) + + def list_columns(self, tables: str | list[str] | None = None) -> list[str]: + if tables is None: + tables = self.db.list_tables() # list of table names + elif isinstance(tables, str): + tables = [tables] # list of tables names (only one) + columns = [c for t in tables for c in self.db.table(t).columns] + return columns + + def _clean_columns(self, columns: list[str]) -> list[str]: + if isinstance(columns, str): + columns = [columns] + + valid_columns = set(self.list_columns()) + cols = list(filter(lambda c: c in valid_columns, columns)) + invalid_columns = set(columns) - valid_columns + if invalid_columns: + raise ValueError( + f"The following columns are not found in any database: {invalid_columns}" + ) + if not cols: + raise ValueError("No valid columns were found.") + return cols + + def _tables_for_columns(self, cols: list, start_with: str | None = None) -> list: + cols = self._clean_columns(cols) + table_list = self._tables_by_degree() # list of table names + + # remove start_with from table_list and add it to the beginning of the list + if start_with is not None: + # check if start_with is a valid table + if start_with not in table_list: + raise ValueError(f"Invalid table: {start_with}") + # remove start_with from table_list and add it to the beginning of the list + table_list.remove(start_with) + table_list = [start_with] + table_list + + tables = [] + for t in table_list: + # check if all columns are in one table + if set(cols).issubset(self.db.table(t).columns): + tables.append(t) + return tables + else: + # check if a single column is in the table + for c in cols.copy(): + if c in self.db.table(t).columns: + if t not in tables: + tables.append(t) + cols.remove(c) # remove column from list + return tables + + def _build_query( + self, + table: Literal["gene", "tx", "exon"], + cols: list[str], + filter: _filters.AbstractFilterExpr, + join_type: Literal["inner", "left"] = "inner", + ) -> IbisTable: + """Build a query for the genomic features table.""" + # Finalize cols + self._clean_columns(cols) + for col in filter.columns(): + if col not in cols: + cols.append(col) + + # check if join is required + tables = self._get_required_tables(self._tables_for_columns(cols)) + + # Basically just to make sure exons stay in the query + if table not in tables: + tables.append(table) + + if len(tables) > 1: + query = self._join_query(tables, start_with=table, join_type=join_type) + else: + query = self.db.table(table) + # add filter + query = query.filter(filter.convert()).select(cols) + return query + + def _join_query( + self, + tables: list[str], + start_with: str, + join_type: Literal["inner", "left"] = "inner", + ) -> IbisTable: + """Join tables and return a query.""" + # check for intermediate tables + JOIN_TABLE = [ + (("gene", "tx"), "gene_id"), + (("gene", "chromosome"), "seq_name"), + (("tx", "tx2exon"), "tx_id"), + (("tx2exon", "exon"), "exon_id"), + (("tx", "protein"), "tx_id"), + (("gene", "entrezgene"), "gene_id"), + (("protein", "protein_domain"), "protein_id"), + (("protein", "uniprot"), "protein_id"), + (("uniprot", "protein_domain"), "protein_id"), + ] + tables = tables.copy() + tables.remove(start_with) + db = self.db + current_tables = [start_with] + query = db.table(start_with) + + while len(tables) > 0: + for (table_names, key), t1_name, t2_name in product( # noqa: B007 + JOIN_TABLE, current_tables, tables + ): + if t1_name in table_names and t2_name in table_names: + break + else: + raise ValueError( + f"Failed to find match for tables: {current_tables} and {tables}" + ) + + current_tables.append(t2_name) + tables.remove(t2_name) + + t2 = db.table(t2_name) + if join_type == "inner": + query = query.join(t2, predicates=[key], how="inner") + elif join_type == "left": + query = query.join( + t2, + predicates=[key], + how="left", + rname="{name}_y", + # suffixes=("", "_y"), + ) + query = query.drop(f"{key}_y") # drop duplicate columns + else: + raise ValueError(f"Invalid join type: {join_type}") + + return query + diff --git a/ucscdb.ipynb b/ucscdb.ipynb new file mode 100644 index 0000000..8074e2e --- /dev/null +++ b/ucscdb.ipynb @@ -0,0 +1,394 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d1cbfd54-ed0a-475a-9ab4-d990ead4fa21", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/felixraimundo/Library/Application Support/hatch/env/virtual/genomic-features/KcSbK2dP/genomic-features/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import ibis\n", + "import genomic_features as gf" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6c41a912-e665-44a0-8df5-a670db50973b", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "4aea998e-7628-4dc4-b0e6-523a7eaf9783", + "metadata": {}, + "outputs": [], + "source": [ + "ensdb = gf.ensembl.annotation(species=\"Hsapiens\", version=\"108\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "1d1b2fb5-cbbd-49dd-8fc4-d64ab3cb5354", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "EnsemblDB(organism='Homo sapiens', ensembl_release='108')" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ensdb" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "463c367d-0586-47e4-852d-7e5f15df87f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Db type': 'EnsDb',\n", + " 'Type of Gene ID': 'Ensembl Gene ID',\n", + " 'Supporting package': 'ensembldb',\n", + " 'Db created by': 'ensembldb package from Bioconductor',\n", + " 'script_version': '0.3.7',\n", + " 'Creation time': 'Fri Oct 28 05:24:43 2022',\n", + " 'ensembl_version': '108',\n", + " 'ensembl_host': 'localhost',\n", + " 'Organism': 'Homo sapiens',\n", + " 'taxonomy_id': '9606',\n", + " 'genome_build': 'GRCh38',\n", + " 'DBSCHEMAVERSION': '2.2'}" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ensdb.metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "828d4119-b66c-460e-8233-0ca84e2c8d17", + "metadata": {}, + "outputs": [], + "source": [ + "ucscdb = gf.ucsc.annotation(species='Hsapiens', assembly='hg38', ucsc_table='knownGene', bioc_version='3.18', )" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "ed47b055-a237-46c8-a69c-cfc287459713", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "UCSCDB(organism='Homo sapiens', ucsc_track='GENCODE V44', genome='hg38, ucsc_table='knownGene'')" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ucscdb" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "9be1fc5f-ca92-419e-b60c-3780b8d0d710", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namevalue
0Db typeTxDb
1Supporting packageGenomicFeatures
2Data sourceUCSC
3Genomehg38
4OrganismHomo sapiens
5Taxonomy ID9606
6UCSC TableknownGene
7UCSC TrackGENCODE V44
8Resource URLhttp://genome.ucsc.edu/
9Type of Gene IDEntrez Gene ID
10Full datasetyes
11miRBase build IDNone
12Nb of transcripts276905
13Db created byGenomicFeatures package from Bioconductor
14Creation time2023-09-20 17:25:17 +0000 (Wed, 20 Sep 2023)
15GenomicFeatures version at creation time1.53.2
16RSQLite version at creation time2.3.1
17DBSCHEMAVERSION1.2
\n", + "
" + ], + "text/plain": [ + " name \\\n", + "0 Db type \n", + "1 Supporting package \n", + "2 Data source \n", + "3 Genome \n", + "4 Organism \n", + "5 Taxonomy ID \n", + "6 UCSC Table \n", + "7 UCSC Track \n", + "8 Resource URL \n", + "9 Type of Gene ID \n", + "10 Full dataset \n", + "11 miRBase build ID \n", + "12 Nb of transcripts \n", + "13 Db created by \n", + "14 Creation time \n", + "15 GenomicFeatures version at creation time \n", + "16 RSQLite version at creation time \n", + "17 DBSCHEMAVERSION \n", + "\n", + " value \n", + "0 TxDb \n", + "1 GenomicFeatures \n", + "2 UCSC \n", + "3 hg38 \n", + "4 Homo sapiens \n", + "5 9606 \n", + "6 knownGene \n", + "7 GENCODE V44 \n", + "8 http://genome.ucsc.edu/ \n", + "9 Entrez Gene ID \n", + "10 yes \n", + "11 None \n", + "12 276905 \n", + "13 GenomicFeatures package from Bioconductor \n", + "14 2023-09-20 17:25:17 +0000 (Wed, 20 Sep 2023) \n", + "15 1.53.2 \n", + "16 2.3.1 \n", + "17 1.2 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ucscdb.table('metadata').execute()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "3dcebadb-95f2-4216-ab67-fb86c155e573", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Athaliana', 'Btaurus', 'Celegans', 'Cfamiliaris', 'Dmelanogaster',\n", + " 'Drerio', 'Ggallus', 'Hsapiens', 'Mmulatta', 'Mmusculus',\n", + " 'Ptroglodytes', 'Rnorvegicus', 'Scerevisiae', 'Sscrofa'],\n", + " dtype=object)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gf.ucsc.list_ucscdb_annotations()['species'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "3de1b23f-d6b7-4d31-86f3-e16a40d7f9ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Btaurus Celegans Cfamiliaris Dmelanogaster Drerio Ggallus Hsapiens Mmulatta Mmusculus Ptroglodytes Rnorvegicus Scerevisiae Sscrofa'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = gf.ucsc.list_ucscdb_annotations()\n", + "' '.join(df['species'].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "1d418c83-c248-404e-a91c-2adf73b4cc99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['cds', 'chrominfo', 'exon', 'gene', 'metadata', 'splicing', 'transcript']" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ucscdb.list_tables()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 140cf1fc0860abf429a1ceba88ab013c3ea991a9 Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Wed, 3 Apr 2024 21:27:13 -0400 Subject: [PATCH 2/4] WIP --- ucscdb.ipynb | 251 --------------------------------------------------- 1 file changed, 251 deletions(-) diff --git a/ucscdb.ipynb b/ucscdb.ipynb index 8074e2e..f2bd454 100644 --- a/ucscdb.ipynb +++ b/ucscdb.ipynb @@ -41,59 +41,6 @@ "ensdb = gf.ensembl.annotation(species=\"Hsapiens\", version=\"108\")" ] }, - { - "cell_type": "code", - "execution_count": 32, - "id": "1d1b2fb5-cbbd-49dd-8fc4-d64ab3cb5354", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "EnsemblDB(organism='Homo sapiens', ensembl_release='108')" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ensdb" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "463c367d-0586-47e4-852d-7e5f15df87f5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'Db type': 'EnsDb',\n", - " 'Type of Gene ID': 'Ensembl Gene ID',\n", - " 'Supporting package': 'ensembldb',\n", - " 'Db created by': 'ensembldb package from Bioconductor',\n", - " 'script_version': '0.3.7',\n", - " 'Creation time': 'Fri Oct 28 05:24:43 2022',\n", - " 'ensembl_version': '108',\n", - " 'ensembl_host': 'localhost',\n", - " 'Organism': 'Homo sapiens',\n", - " 'taxonomy_id': '9606',\n", - " 'genome_build': 'GRCh38',\n", - " 'DBSCHEMAVERSION': '2.2'}" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ensdb.metadata" - ] - }, { "cell_type": "code", "execution_count": 39, @@ -104,204 +51,6 @@ "ucscdb = gf.ucsc.annotation(species='Hsapiens', assembly='hg38', ucsc_table='knownGene', bioc_version='3.18', )" ] }, - { - "cell_type": "code", - "execution_count": 44, - "id": "ed47b055-a237-46c8-a69c-cfc287459713", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "UCSCDB(organism='Homo sapiens', ucsc_track='GENCODE V44', genome='hg38, ucsc_table='knownGene'')" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ucscdb" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "9be1fc5f-ca92-419e-b60c-3780b8d0d710", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namevalue
0Db typeTxDb
1Supporting packageGenomicFeatures
2Data sourceUCSC
3Genomehg38
4OrganismHomo sapiens
5Taxonomy ID9606
6UCSC TableknownGene
7UCSC TrackGENCODE V44
8Resource URLhttp://genome.ucsc.edu/
9Type of Gene IDEntrez Gene ID
10Full datasetyes
11miRBase build IDNone
12Nb of transcripts276905
13Db created byGenomicFeatures package from Bioconductor
14Creation time2023-09-20 17:25:17 +0000 (Wed, 20 Sep 2023)
15GenomicFeatures version at creation time1.53.2
16RSQLite version at creation time2.3.1
17DBSCHEMAVERSION1.2
\n", - "
" - ], - "text/plain": [ - " name \\\n", - "0 Db type \n", - "1 Supporting package \n", - "2 Data source \n", - "3 Genome \n", - "4 Organism \n", - "5 Taxonomy ID \n", - "6 UCSC Table \n", - "7 UCSC Track \n", - "8 Resource URL \n", - "9 Type of Gene ID \n", - "10 Full dataset \n", - "11 miRBase build ID \n", - "12 Nb of transcripts \n", - "13 Db created by \n", - "14 Creation time \n", - "15 GenomicFeatures version at creation time \n", - "16 RSQLite version at creation time \n", - "17 DBSCHEMAVERSION \n", - "\n", - " value \n", - "0 TxDb \n", - "1 GenomicFeatures \n", - "2 UCSC \n", - "3 hg38 \n", - "4 Homo sapiens \n", - "5 9606 \n", - "6 knownGene \n", - "7 GENCODE V44 \n", - "8 http://genome.ucsc.edu/ \n", - "9 Entrez Gene ID \n", - "10 yes \n", - "11 None \n", - "12 276905 \n", - "13 GenomicFeatures package from Bioconductor \n", - "14 2023-09-20 17:25:17 +0000 (Wed, 20 Sep 2023) \n", - "15 1.53.2 \n", - "16 2.3.1 \n", - "17 1.2 " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ucscdb.table('metadata').execute()" - ] - }, { "cell_type": "code", "execution_count": 19, From 92ffcc6c3a6d329c20b375ee7459fb50b5562299 Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Fri, 26 Apr 2024 23:51:54 -0400 Subject: [PATCH 3/4] wip --- genomicFeatures_test.ipynb | 241 +++++++ src/genomic_features/ucsc/ucscdb.py | 211 ++---- ucscdb.ipynb | 967 +++++++++++++++++++++++++++- 3 files changed, 1221 insertions(+), 198 deletions(-) create mode 100644 genomicFeatures_test.ipynb diff --git a/genomicFeatures_test.ipynb b/genomicFeatures_test.ipynb new file mode 100644 index 0000000..8b8263d --- /dev/null +++ b/genomicFeatures_test.ipynb @@ -0,0 +1,241 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a699ddc4-502f-418e-9f26-99677ad07cff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "The downloaded binary packages are in\n", + "\t/var/folders/zs/gjblv2b16g3b50jqcq6fw76m0000gq/T//RtmpDxAFv6/downloaded_packages\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "'getOption(\"repos\")' replaces Bioconductor standard repositories, see\n", + "'help(\"repositories\", package = \"BiocManager\")' for details.\n", + "Replacement repositories:\n", + " CRAN: https://cran.r-project.org\n", + "\n", + "Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.3 (2024-02-29)\n", + "\n", + "Installing package(s) 'BiocVersion'\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "The downloaded binary packages are in\n", + "\t/var/folders/zs/gjblv2b16g3b50jqcq6fw76m0000gq/T//RtmpDxAFv6/downloaded_packages\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Old packages: 'boot', 'codetools', 'lattice'\n", + "\n" + ] + } + ], + "source": [ + "if (!require(\"BiocManager\", quietly = TRUE))\n", + " install.packages(\"BiocManager\")\n", + "BiocManager::install(version = \"3.18\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a54fc80d-569d-409d-8163-1ff4215a6a7c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "'getOption(\"repos\")' replaces Bioconductor standard repositories, see\n", + "'help(\"repositories\", package = \"BiocManager\")' for details.\n", + "Replacement repositories:\n", + " CRAN: https://cran.r-project.org\n", + "\n", + "Bioconductor version 3.18 (BiocManager 1.30.22), R 4.3.3 (2024-02-29)\n", + "\n", + "Installing package(s) 'TxDb.Hsapiens.UCSC.hg38.knownGene'\n", + "\n", + "installing the source package ‘TxDb.Hsapiens.UCSC.hg38.knownGene’\n", + "\n", + "\n", + "Old packages: 'boot', 'codetools', 'lattice'\n", + "\n" + ] + } + ], + "source": [ + "# BiocManager::install(\"GenomicFeatures\")\n", + "BiocManager::install(\"TxDb.Hsapiens.UCSC.hg38.knownGene\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9f2ca2f7-6c28-4a70-ace8-3015e7fbf0b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TxDb object:\n", + "# Db type: TxDb\n", + "# Supporting package: GenomicFeatures\n", + "# Data source: UCSC\n", + "# Genome: hg38\n", + "# Organism: Homo sapiens\n", + "# Taxonomy ID: 9606\n", + "# UCSC Table: knownGene\n", + "# UCSC Track: GENCODE V44\n", + "# Resource URL: http://genome.ucsc.edu/\n", + "# Type of Gene ID: Entrez Gene ID\n", + "# Full dataset: yes\n", + "# miRBase build ID: NA\n", + "# Nb of transcripts: 276905\n", + "# Db created by: GenomicFeatures package from Bioconductor\n", + "# Creation time: 2023-09-20 17:25:17 +0000 (Wed, 20 Sep 2023)\n", + "# GenomicFeatures version at creation time: 1.53.2\n", + "# RSQLite version at creation time: 2.3.1\n", + "# DBSCHEMAVERSION: 1.2" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "library(TxDb.Hsapiens.UCSC.hg38.knownGene)\n", + "txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene\n", + "txdb" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2ded4ebf-2390-4261-93ee-464bcb7d58db", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 2135 genes were dropped because they have exons located on both strands\n", + " of the same reference sequence or on more than one reference sequence,\n", + " so cannot be represented by a single genomic range.\n", + " Use 'single.strand.genes.only=FALSE' to get all the genes in a\n", + " GRangesList object, or use suppressMessages() to suppress this message.\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "GRanges object with 30733 ranges and 1 metadata column:\n", + " seqnames ranges strand | gene_id\n", + " | \n", + " 1 chr19 58345178-58362751 - | 1\n", + " 10 chr8 18386311-18401218 + | 10\n", + " 100 chr20 44584896-44652252 - | 100\n", + " 1000 chr18 27932879-28177946 - | 1000\n", + " 100008586 chrX 49551278-49568218 + | 100008586\n", + " ... ... ... ... . ...\n", + " 9990 chr15 34229784-34338060 - | 9990\n", + " 9991 chr9 112217716-112333664 - | 9991\n", + " 9992 chr21 34364006-34371381 + | 9992\n", + " 9993 chr22 19036282-19122454 - | 9993\n", + " 9997 chr22 50523568-50526461 - | 9997\n", + " -------\n", + " seqinfo: 711 sequences (1 circular) from hg38 genome" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "genes(txdb)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "41f123b2-6f97-4f33-b54f-46c07603a432", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 2135 genes were dropped because they have exons located on both strands\n", + " of the same reference sequence or on more than one reference sequence,\n", + " so cannot be represented by a single genomic range.\n", + " Use 'single.strand.genes.only=FALSE' to get all the genes in a\n", + " GRangesList object, or use suppressMessages() to suppress this message.\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "GRanges object with 30733 ranges and 1 metadata column:\n", + " seqnames ranges strand | gene_id\n", + " | \n", + " 1 chr19 58345178-58362751 - | 1\n", + " 10 chr8 18386311-18401218 + | 10\n", + " 100 chr20 44584896-44652252 - | 100\n", + " 1000 chr18 27932879-28177946 - | 1000\n", + " 100008586 chrX 49551278-49568218 + | 100008586\n", + " ... ... ... ... . ...\n", + " 9990 chr15 34229784-34338060 - | 9990\n", + " 9991 chr9 112217716-112333664 - | 9991\n", + " 9992 chr21 34364006-34371381 + | 9992\n", + " 9993 chr22 19036282-19122454 - | 9993\n", + " 9997 chr22 50523568-50526461 - | 9997\n", + " -------\n", + " seqinfo: 711 sequences (1 circular) from hg38 genome" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "genes(txdb)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/genomic_features/ucsc/ucscdb.py b/src/genomic_features/ucsc/ucscdb.py index 27a0d5d..c7d3f3f 100644 --- a/src/genomic_features/ucsc/ucscdb.py +++ b/src/genomic_features/ucsc/ucscdb.py @@ -28,6 +28,22 @@ ) TIMESTAMP_URL = "https://annotationhub.bioconductor.org/metadata/database_timestamp" +_TX_TABLE = 'transcript' +_EXONS_TABLE = 'exon' +_GENES_TABLE = 'gene' + +_PRETTY_NAMES = { + '_tx_id': 'tx_id', + 'tx_chrom': 'chrom', + 'tx_strand': 'strand', + 'tx_start': 'start', + 'tx_end': 'end', + '_exon_id': 'exon_id', + 'exon_chrom': 'chrom', + 'exon_strand': 'strand', + 'exon_start': 'start', + 'exon_end': 'end', +} def annotation(species: str, bioc_version: str, assembly: str, ucsc_table: str) -> UCSCDB: @@ -135,106 +151,44 @@ def __repr__(self) -> str: d = self.metadata return f"UCSCDB(organism='{d['Organism']}', ucsc_track='{d['UCSC Track']}', genome='{d['Genome']}', ucsc_table='{d['UCSC Table']}')" - # TODO(gamazeps): should we add some info on that ? UCSC just has tx_id - def genes( - self, - cols: list[str] | None = None, - filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), - join_type: Literal["inner", "left"] = "inner", - ) -> DataFrame: - table: Final = "gene" - if cols is None: - cols = self.list_columns(table) # get all columns - - cols = cols.copy() - if "gene_id" not in cols: # genes always needs gene_id - cols.append("gene_id") + def chrominfo(self) -> DataFrame: + return self.db.table("chrominfo").execute() - query = self._build_query(table, cols, filter, join_type) - return self._execute_query(query) + def list_tables(self) -> list: + return self.db.list_tables() def transcripts( self, - cols: list[str] | None = None, - filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), - join_type: Literal["inner", "left"] = "inner", + #cols: list[str] | None = None, + #filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), ) -> DataFrame: - table: Final = "transcript" - if cols is None: - cols = self.list_columns(table) # get all columns - - cols = cols.copy() - # Require primary key in output - if "_tx_id" not in cols: - cols.append("_tx_id") - # seq_name is required for genomic range operations - if ("tx_start" in cols or "tx_end" in cols) and "tx_chrome" not in cols: - cols.append("tx_chrom") - - query = self._build_query(table, cols, filter, join_type) - return self._execute_query(query) + tx = self.db.table(_TX_TABLE).execute() + tx = tx.rename(columns=_PRETTY_NAMES) + tx = tx.drop('tx_type', axis=1) # always None + return tx def exons( self, - cols: list[str] | None = None, - filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), - join_type: Literal["inner", "left"] = "inner", + #cols: list[str] | None = None, + #filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), ) -> DataFrame: - table: Final = "exon" - if cols is None: - cols = self.list_columns(table) # get all columns - - cols = cols.copy() - # Require primary key in output - if "_exon_id" not in cols: - cols.append("_exon_id") - # seq_name is required for genomic range operations - if ( - "exon_start" in cols or "exon_end" in cols - ) and "exon_chrom" not in cols: - cols.append("exon_chrom") - - query = self._build_query(table, cols, filter, join_type) - return self._execute_query(query) + exons = self.db.table(_EXONS_TABLE).execute() + exons = exons.rename(columns=_PRETTY_NAMES) + exons = exons.drop('exon_name', axis=1) # always None + return exons + + def genes( + self, + #cols: list[str] | None = None, + #filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), + ) -> DataFrame: + genes = self.db.table(_GENES_TABLE).execute() + return genes def _execute_query(self, query: IbisTable) -> DataFrame: # TODO: Allow more options for returning results return query.distinct().execute() - def chrominfo(self) -> DataFrame: - return self.db.table("chrominfo").execute() - - def list_tables(self) -> list: - return self.db.list_tables() - - def _tables_by_degree(self, tab: list[str] = None) -> list: - if tab is None: - tab = self.list_tables() # list of table names - # check that all tables are in the database and print warning - if not set(tab).issubset(set(self.list_tables())): - missing_tables = ", ".join(set(tab) - set(self.list_tables())) - warnings.warn( - f"The following tables are not in the database: {missing_tables}.", - UserWarning, - stacklevel=2, - ) - - tab = list(set(tab) & set(self.list_tables())) # remove tables not in db - - # order tables - - table_order = { - "transcript": 1, - "cds": 2, - "gene": 3, - "splicing": 4, - "exon": 5, - "chrominfo": 6, - "metadata": 99, - } - - return sorted(tab, key=lambda x: table_order[x]) - def list_columns(self, tables: str | list[str] | None = None) -> list[str]: if tables is None: tables = self.db.list_tables() # list of table names @@ -258,34 +212,6 @@ def _clean_columns(self, columns: list[str]) -> list[str]: raise ValueError("No valid columns were found.") return cols - def _tables_for_columns(self, cols: list, start_with: str | None = None) -> list: - cols = self._clean_columns(cols) - table_list = self._tables_by_degree() # list of table names - - # remove start_with from table_list and add it to the beginning of the list - if start_with is not None: - # check if start_with is a valid table - if start_with not in table_list: - raise ValueError(f"Invalid table: {start_with}") - # remove start_with from table_list and add it to the beginning of the list - table_list.remove(start_with) - table_list = [start_with] + table_list - - tables = [] - for t in table_list: - # check if all columns are in one table - if set(cols).issubset(self.db.table(t).columns): - tables.append(t) - return tables - else: - # check if a single column is in the table - for c in cols.copy(): - if c in self.db.table(t).columns: - if t not in tables: - tables.append(t) - cols.remove(c) # remove column from list - return tables - def _build_query( self, table: Literal["gene", "tx", "exon"], @@ -314,60 +240,3 @@ def _build_query( # add filter query = query.filter(filter.convert()).select(cols) return query - - def _join_query( - self, - tables: list[str], - start_with: str, - join_type: Literal["inner", "left"] = "inner", - ) -> IbisTable: - """Join tables and return a query.""" - # check for intermediate tables - JOIN_TABLE = [ - (("gene", "tx"), "gene_id"), - (("gene", "chromosome"), "seq_name"), - (("tx", "tx2exon"), "tx_id"), - (("tx2exon", "exon"), "exon_id"), - (("tx", "protein"), "tx_id"), - (("gene", "entrezgene"), "gene_id"), - (("protein", "protein_domain"), "protein_id"), - (("protein", "uniprot"), "protein_id"), - (("uniprot", "protein_domain"), "protein_id"), - ] - tables = tables.copy() - tables.remove(start_with) - db = self.db - current_tables = [start_with] - query = db.table(start_with) - - while len(tables) > 0: - for (table_names, key), t1_name, t2_name in product( # noqa: B007 - JOIN_TABLE, current_tables, tables - ): - if t1_name in table_names and t2_name in table_names: - break - else: - raise ValueError( - f"Failed to find match for tables: {current_tables} and {tables}" - ) - - current_tables.append(t2_name) - tables.remove(t2_name) - - t2 = db.table(t2_name) - if join_type == "inner": - query = query.join(t2, predicates=[key], how="inner") - elif join_type == "left": - query = query.join( - t2, - predicates=[key], - how="left", - rname="{name}_y", - # suffixes=("", "_y"), - ) - query = query.drop(f"{key}_y") # drop duplicate columns - else: - raise ValueError(f"Invalid join type: {join_type}") - - return query - diff --git a/ucscdb.ipynb b/ucscdb.ipynb index f2bd454..131b307 100644 --- a/ucscdb.ipynb +++ b/ucscdb.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "d1cbfd54-ed0a-475a-9ab4-d990ead4fa21", "metadata": {}, "outputs": [ @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "6c41a912-e665-44a0-8df5-a670db50973b", "metadata": {}, "outputs": [], @@ -31,91 +31,1004 @@ "%autoreload 2" ] }, + { + "cell_type": "code", + "execution_count": 4, + "id": "828d4119-b66c-460e-8233-0ca84e2c8d17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "UCSCDB(organism='Homo sapiens', ucsc_track='GENCODE V44', genome='hg38', ucsc_table='knownGene')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ucscdb = gf.ucsc.annotation(species='Hsapiens', assembly='hg38', ucsc_table='knownGene', bioc_version='3.18', )\n", + "ucscdb" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ef0b732b-84af-4528-8714-c3843828f321", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Db type': 'TxDb',\n", + " 'Supporting package': 'GenomicFeatures',\n", + " 'Data source': 'UCSC',\n", + " 'Genome': 'hg38',\n", + " 'Organism': 'Homo sapiens',\n", + " 'Taxonomy ID': '9606',\n", + " 'UCSC Table': 'knownGene',\n", + " 'UCSC Track': 'GENCODE V44',\n", + " 'Resource URL': 'http://genome.ucsc.edu/',\n", + " 'Type of Gene ID': 'Entrez Gene ID',\n", + " 'Full dataset': 'yes',\n", + " 'miRBase build ID': None,\n", + " 'Nb of transcripts': '276905',\n", + " 'Db created by': 'GenomicFeatures package from Bioconductor',\n", + " 'Creation time': '2023-09-20 17:25:17 +0000 (Wed, 20 Sep 2023)',\n", + " 'GenomicFeatures version at creation time': '1.53.2',\n", + " 'RSQLite version at creation time': '2.3.1',\n", + " 'DBSCHEMAVERSION': '1.2'}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ucscdb.metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1dca4051-dc86-4654-862e-346a6e578d93", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cds ibis.Schema {\n", + " _cds_id int32\n", + " cds_name string\n", + " cds_chrom !string\n", + " cds_strand !string\n", + " cds_start !int32\n", + " cds_end !int32\n", + "}\n", + "chrominfo ibis.Schema {\n", + " _chrom_id int32\n", + " chrom !string\n", + " length int32\n", + " is_circular int32\n", + "}\n", + "exon ibis.Schema {\n", + " _exon_id int32\n", + " exon_name string\n", + " exon_chrom !string\n", + " exon_strand !string\n", + " exon_start !int32\n", + " exon_end !int32\n", + "}\n", + "gene ibis.Schema {\n", + " gene_id !string\n", + " _tx_id !int32\n", + "}\n", + "metadata ibis.Schema {\n", + " name string\n", + " value string\n", + "}\n", + "splicing ibis.Schema {\n", + " _tx_id !int32\n", + " exon_rank !int32\n", + " _exon_id !int32\n", + " _cds_id int32\n", + " cds_phase int32\n", + "}\n", + "transcript ibis.Schema {\n", + " _tx_id int32\n", + " tx_name string\n", + " tx_type string\n", + " tx_chrom !string\n", + " tx_strand !string\n", + " tx_start !int32\n", + " tx_end !int32\n", + "}\n" + ] + } + ], + "source": [ + "for tbl_name in ucscdb.db.list_tables():\n", + " print(tbl_name, ucscdb.db.table(tbl_name).schema())" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "c1d5a9cb-c884-43d0-beac-12237e0fb2da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_tx_idtx_nametx_typetx_chromtx_strandtx_starttx_end
01ENST00000456328.2Nonechr1+1186914409
12ENST00000450305.2Nonechr1+1201013670
23ENST00000473358.1Nonechr1+2955431097
34ENST00000469289.1Nonechr1+3026731109
45ENST00000607096.1Nonechr1+3036630503
........................
276900276901ENST00000710260.1NonechrX_MU273397v1_alt-239036260095
276901276902ENST00000710028.1NonechrX_MU273397v1_alt-272358282686
276902276903ENST00000710030.1NonechrX_MU273397v1_alt-314193316302
276903276904ENST00000710216.1NonechrX_MU273397v1_alt-314813315236
276904276905ENST00000710031.1NonechrX_MU273397v1_alt-324527324923
\n", + "

276905 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " _tx_id tx_name tx_type tx_chrom tx_strand \\\n", + "0 1 ENST00000456328.2 None chr1 + \n", + "1 2 ENST00000450305.2 None chr1 + \n", + "2 3 ENST00000473358.1 None chr1 + \n", + "3 4 ENST00000469289.1 None chr1 + \n", + "4 5 ENST00000607096.1 None chr1 + \n", + "... ... ... ... ... ... \n", + "276900 276901 ENST00000710260.1 None chrX_MU273397v1_alt - \n", + "276901 276902 ENST00000710028.1 None chrX_MU273397v1_alt - \n", + "276902 276903 ENST00000710030.1 None chrX_MU273397v1_alt - \n", + "276903 276904 ENST00000710216.1 None chrX_MU273397v1_alt - \n", + "276904 276905 ENST00000710031.1 None chrX_MU273397v1_alt - \n", + "\n", + " tx_start tx_end \n", + "0 11869 14409 \n", + "1 12010 13670 \n", + "2 29554 31097 \n", + "3 30267 31109 \n", + "4 30366 30503 \n", + "... ... ... \n", + "276900 239036 260095 \n", + "276901 272358 282686 \n", + "276902 314193 316302 \n", + "276903 314813 315236 \n", + "276904 324527 324923 \n", + "\n", + "[276905 rows x 7 columns]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ucscdb.db.table('transcript').execute()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "2dfccc7b-df65-4072-9b18-eebbd486d69b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tx_idtx_namechromstrandstartend
01ENST00000456328.2chr1+1186914409
12ENST00000450305.2chr1+1201013670
23ENST00000473358.1chr1+2955431097
34ENST00000469289.1chr1+3026731109
45ENST00000607096.1chr1+3036630503
.....................
276900276901ENST00000710260.1chrX_MU273397v1_alt-239036260095
276901276902ENST00000710028.1chrX_MU273397v1_alt-272358282686
276902276903ENST00000710030.1chrX_MU273397v1_alt-314193316302
276903276904ENST00000710216.1chrX_MU273397v1_alt-314813315236
276904276905ENST00000710031.1chrX_MU273397v1_alt-324527324923
\n", + "

276905 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " tx_id tx_name chrom strand start end\n", + "0 1 ENST00000456328.2 chr1 + 11869 14409\n", + "1 2 ENST00000450305.2 chr1 + 12010 13670\n", + "2 3 ENST00000473358.1 chr1 + 29554 31097\n", + "3 4 ENST00000469289.1 chr1 + 30267 31109\n", + "4 5 ENST00000607096.1 chr1 + 30366 30503\n", + "... ... ... ... ... ... ...\n", + "276900 276901 ENST00000710260.1 chrX_MU273397v1_alt - 239036 260095\n", + "276901 276902 ENST00000710028.1 chrX_MU273397v1_alt - 272358 282686\n", + "276902 276903 ENST00000710030.1 chrX_MU273397v1_alt - 314193 316302\n", + "276903 276904 ENST00000710216.1 chrX_MU273397v1_alt - 314813 315236\n", + "276904 276905 ENST00000710031.1 chrX_MU273397v1_alt - 324527 324923\n", + "\n", + "[276905 rows x 6 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tx = ucscdb.transcripts()\n", + "tx" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d01654ff-75d6-415a-a37f-3fb1c6e2c02e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
exon_idchromstrandstartend
01chr1+1186912227
12chr1+1201012057
23chr1+1217912227
34chr1+1261312697
45chr1+1261312721
..................
734617734618chrX_MU273397v1_alt-314193314248
734618734619chrX_MU273397v1_alt-314813315236
734619734620chrX_MU273397v1_alt-315258315407
734620734621chrX_MU273397v1_alt-316254316302
734621734622chrX_MU273397v1_alt-324527324923
\n", + "

734622 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " exon_id chrom strand start end\n", + "0 1 chr1 + 11869 12227\n", + "1 2 chr1 + 12010 12057\n", + "2 3 chr1 + 12179 12227\n", + "3 4 chr1 + 12613 12697\n", + "4 5 chr1 + 12613 12721\n", + "... ... ... ... ... ...\n", + "734617 734618 chrX_MU273397v1_alt - 314193 314248\n", + "734618 734619 chrX_MU273397v1_alt - 314813 315236\n", + "734619 734620 chrX_MU273397v1_alt - 315258 315407\n", + "734620 734621 chrX_MU273397v1_alt - 316254 316302\n", + "734621 734622 chrX_MU273397v1_alt - 324527 324923\n", + "\n", + "[734622 rows x 5 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exons = ucscdb.exons()\n", + "exons" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "e9bb67ce-ca75-470f-8d59-520595d6229a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gene_id_tx_id
06011264219
16011264220
2100130386264224
365265271292
465265271293
.........
2355205565722459
2355215565722460
2355225565722461
2355235565722462
2355245565722463
\n", + "

235525 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " gene_id _tx_id\n", + "0 6011 264219\n", + "1 6011 264220\n", + "2 100130386 264224\n", + "3 65265 271292\n", + "4 65265 271293\n", + "... ... ...\n", + "235520 55657 22459\n", + "235521 55657 22460\n", + "235522 55657 22461\n", + "235523 55657 22462\n", + "235524 55657 22463\n", + "\n", + "[235525 rows x 2 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = ucscdb.genes()\n", + "df" + ] + }, { "cell_type": "code", "execution_count": 34, - "id": "4aea998e-7628-4dc4-b0e6-523a7eaf9783", + "id": "d1362962-a610-45f3-95c2-bec594b6f871", "metadata": {}, "outputs": [], "source": [ - "ensdb = gf.ensembl.annotation(species=\"Hsapiens\", version=\"108\")" + "s = ucscdb.db.table('splicing').execute()" ] }, { "cell_type": "code", - "execution_count": 39, - "id": "828d4119-b66c-460e-8233-0ca84e2c8d17", + "execution_count": 53, + "id": "0652e3d5-8fef-4777-b989-6b4d431d8134", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_tx_idexon_rank_exon_id_cds_idcds_phase
0111NaNNone
1125NaNNone
2138NaNNone
311639131607NaNNone
43110NaNNone
..................
17889552748472729350NaNNone
17889562748473729351NaNNone
17889572748474729352NaNNone
17889582748475729353NaNNone
17889592748621729407NaNNone
\n", + "

1788960 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " _tx_id exon_rank _exon_id _cds_id cds_phase\n", + "0 1 1 1 NaN None\n", + "1 1 2 5 NaN None\n", + "2 1 3 8 NaN None\n", + "3 11639 1 31607 NaN None\n", + "4 3 1 10 NaN None\n", + "... ... ... ... ... ...\n", + "1788955 274847 2 729350 NaN None\n", + "1788956 274847 3 729351 NaN None\n", + "1788957 274847 4 729352 NaN None\n", + "1788958 274847 5 729353 NaN None\n", + "1788959 274862 1 729407 NaN None\n", + "\n", + "[1788960 rows x 5 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "ucscdb = gf.ucsc.annotation(species='Hsapiens', assembly='hg38', ucsc_table='knownGene', bioc_version='3.18', )" + "s" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "3dcebadb-95f2-4216-ab67-fb86c155e573", + "execution_count": 55, + "id": "d4225b28-7891-464f-ab88-3853eab0746c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array(['Athaliana', 'Btaurus', 'Celegans', 'Cfamiliaris', 'Dmelanogaster',\n", - " 'Drerio', 'Ggallus', 'Hsapiens', 'Mmulatta', 'Mmusculus',\n", - " 'Ptroglodytes', 'Rnorvegicus', 'Scerevisiae', 'Sscrofa'],\n", - " dtype=object)" + "tx_id 1\n", + "tx_name ENST00000456328.2\n", + "chrom chr1\n", + "strand +\n", + "start 11869\n", + "end 14409\n", + "Name: 0, dtype: object" ] }, - "execution_count": 19, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "gf.ucsc.list_ucscdb_annotations()['species'].unique()" + "tx.loc[0]" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "3de1b23f-d6b7-4d31-86f3-e16a40d7f9ce", + "execution_count": 52, + "id": "428fa2df-0960-48b5-9466-fb2f5bfd0b4d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'Btaurus Celegans Cfamiliaris Dmelanogaster Drerio Ggallus Hsapiens Mmulatta Mmusculus Ptroglodytes Rnorvegicus Scerevisiae Sscrofa'" + "966235" ] }, - "execution_count": 23, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = gf.ucsc.list_ucscdb_annotations()\n", - "' '.join(df['species'].unique())" + "sum(s['_cds_id'].isnull() == False)" ] }, { "cell_type": "code", - "execution_count": 40, - "id": "1d418c83-c248-404e-a91c-2adf73b4cc99", + "execution_count": 50, + "id": "ab1837e6-9994-4ddc-adb3-097e475af1f9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['cds', 'chrominfo', 'exon', 'gene', 'metadata', 'splicing', 'transcript']" + "0 True\n", + "1 True\n", + "2 True\n", + "3 True\n", + "4 True\n", + " ... \n", + "1788955 True\n", + "1788956 True\n", + "1788957 True\n", + "1788958 True\n", + "1788959 True\n", + "Name: _cds_id, Length: 1788960, dtype: bool" ] }, - "execution_count": 40, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ucscdb.list_tables()" + "s['_cds_id'].isnull()" ] } ], From 1f0284cc7df4721df514212135e8299c6a767334 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 27 Apr 2024 03:52:25 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/genomic_features/ucsc/ucscdb.py | 80 +++++++++++++++-------------- ucscdb.ipynb | 16 +++--- 2 files changed, 51 insertions(+), 45 deletions(-) diff --git a/src/genomic_features/ucsc/ucscdb.py b/src/genomic_features/ucsc/ucscdb.py index c7d3f3f..c37a3ec 100644 --- a/src/genomic_features/ucsc/ucscdb.py +++ b/src/genomic_features/ucsc/ucscdb.py @@ -1,11 +1,9 @@ from __future__ import annotations -import warnings -from functools import cached_property -from itertools import product import os +from functools import cached_property from pathlib import Path -from typing import Final, Literal +from typing import Literal import ibis import requests @@ -14,7 +12,6 @@ from pandas import DataFrame, Timestamp from requests.exceptions import HTTPError -from genomic_features import filters from genomic_features._core import filters as _filters from genomic_features._core.cache import retrieve_annotation @@ -28,32 +25,36 @@ ) TIMESTAMP_URL = "https://annotationhub.bioconductor.org/metadata/database_timestamp" -_TX_TABLE = 'transcript' -_EXONS_TABLE = 'exon' -_GENES_TABLE = 'gene' +_TX_TABLE = "transcript" +_EXONS_TABLE = "exon" +_GENES_TABLE = "gene" _PRETTY_NAMES = { - '_tx_id': 'tx_id', - 'tx_chrom': 'chrom', - 'tx_strand': 'strand', - 'tx_start': 'start', - 'tx_end': 'end', - '_exon_id': 'exon_id', - 'exon_chrom': 'chrom', - 'exon_strand': 'strand', - 'exon_start': 'start', - 'exon_end': 'end', + "_tx_id": "tx_id", + "tx_chrom": "chrom", + "tx_strand": "strand", + "tx_start": "start", + "tx_end": "end", + "_exon_id": "exon_id", + "exon_chrom": "chrom", + "exon_strand": "strand", + "exon_start": "start", + "exon_end": "end", } -def annotation(species: str, bioc_version: str, assembly: str, - ucsc_table: str) -> UCSCDB: + +def annotation( + species: str, bioc_version: str, assembly: str, ucsc_table: str +) -> UCSCDB: try: ucscdb = UCSCDB( ibis.sqlite.connect( - retrieve_annotation(os.path.join( - BIOC_ANNOTATION_HUB_URL, - f"ucsc/standard/{bioc_version}/TxDb.{species}.UCSC.{assembly}.{ucsc_table}.sqlite" - )) + retrieve_annotation( + os.path.join( + BIOC_ANNOTATION_HUB_URL, + f"ucsc/standard/{bioc_version}/TxDb.{species}.UCSC.{assembly}.{ucsc_table}.sqlite", + ) + ) ) ) except HTTPError as err: @@ -84,7 +85,7 @@ def list_ucscdb_annotations(species: None | str | list[str] = None) -> DataFrame ----- >>> gf.ensembl.list_ensdb_annotations("Mmusculus") """ - _COL_ORDERS = ['species', 'assembly', 'ucsc_table', 'bioc_version'] + _COL_ORDERS = ["species", "assembly", "ucsc_table", "bioc_version"] # Get latest AnnotationHub timestamp db_path = Path(retrieve_annotation(ANNOTATION_HUB_URL)) timestamp = requests.get(TIMESTAMP_URL).text @@ -98,12 +99,13 @@ def list_ucscdb_annotations(species: None | str | list[str] = None) -> DataFrame version_table = ( ahdb.table("rdatapaths").filter(deferred.rdataclass == "TxDb").execute() ) - version_table = version_table[version_table['rdatapath'].map(lambda x: x.split('/')[0] == 'ucsc')] + version_table = version_table[ + version_table["rdatapath"].map(lambda x: x.split("/")[0] == "ucsc") + ] - version_table["bioc_version"] = ( - version_table["rdatapath"] - .str.split("/", expand=True)[2] - ) + version_table["bioc_version"] = version_table["rdatapath"].str.split( + "/", expand=True + )[2] version_table["species"] = ( version_table["rdatapath"] .str.split("/", expand=True)[3] @@ -120,7 +122,7 @@ def list_ucscdb_annotations(species: None | str | list[str] = None) -> DataFrame .str.split(".", expand=True)[4] ) # `Athaliana` do not follow the normal name formatting, drop them. - version_table = version_table[version_table['ucsc_table'] != 'sqlite'] + version_table = version_table[version_table["ucsc_table"] != "sqlite"] if species is not None: if isinstance(species, str): @@ -159,28 +161,28 @@ def list_tables(self) -> list: def transcripts( self, - #cols: list[str] | None = None, - #filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), + # cols: list[str] | None = None, + # filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), ) -> DataFrame: tx = self.db.table(_TX_TABLE).execute() tx = tx.rename(columns=_PRETTY_NAMES) - tx = tx.drop('tx_type', axis=1) # always None + tx = tx.drop("tx_type", axis=1) # always None return tx def exons( self, - #cols: list[str] | None = None, - #filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), + # cols: list[str] | None = None, + # filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), ) -> DataFrame: exons = self.db.table(_EXONS_TABLE).execute() exons = exons.rename(columns=_PRETTY_NAMES) - exons = exons.drop('exon_name', axis=1) # always None + exons = exons.drop("exon_name", axis=1) # always None return exons def genes( self, - #cols: list[str] | None = None, - #filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), + # cols: list[str] | None = None, + # filter: _filters.AbstractFilterExpr = filters.EmptyFilter(), ) -> DataFrame: genes = self.db.table(_GENES_TABLE).execute() return genes diff --git a/ucscdb.ipynb b/ucscdb.ipynb index 131b307..82a171b 100644 --- a/ucscdb.ipynb +++ b/ucscdb.ipynb @@ -16,7 +16,6 @@ } ], "source": [ - "import ibis\n", "import genomic_features as gf" ] }, @@ -49,7 +48,12 @@ } ], "source": [ - "ucscdb = gf.ucsc.annotation(species='Hsapiens', assembly='hg38', ucsc_table='knownGene', bioc_version='3.18', )\n", + "ucscdb = gf.ucsc.annotation(\n", + " species=\"Hsapiens\",\n", + " assembly=\"hg38\",\n", + " ucsc_table=\"knownGene\",\n", + " bioc_version=\"3.18\",\n", + ")\n", "ucscdb" ] }, @@ -343,7 +347,7 @@ } ], "source": [ - "ucscdb.db.table('transcript').execute()" + "ucscdb.db.table(\"transcript\").execute()" ] }, { @@ -794,7 +798,7 @@ "metadata": {}, "outputs": [], "source": [ - "s = ucscdb.db.table('splicing').execute()" + "s = ucscdb.db.table(\"splicing\").execute()" ] }, { @@ -996,7 +1000,7 @@ } ], "source": [ - "sum(s['_cds_id'].isnull() == False)" + "sum(s[\"_cds_id\"].isnull() == False)" ] }, { @@ -1028,7 +1032,7 @@ } ], "source": [ - "s['_cds_id'].isnull()" + "s[\"_cds_id\"].isnull()" ] } ],