diff --git a/docs/api.md b/docs/api.md index a0d853c..9dac923 100644 --- a/docs/api.md +++ b/docs/api.md @@ -9,6 +9,7 @@ .. autosummary:: :toctree: generated + genomic_features.annotate ensembl.annotation ensembl.EnsemblDB ensembl.list_ensdb_annotations diff --git a/docs/notebooks/basic_usage.ipynb b/docs/notebooks/basic_usage.ipynb index 0a90d2f..cdd6b81 100644 --- a/docs/notebooks/basic_usage.ipynb +++ b/docs/notebooks/basic_usage.ipynb @@ -39,22 +39,32 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 19, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading data from 'https://bioconductorhubs.blob.core.windows.net/annotationhub/AHEnsDbs/v98/EnsDb.Hsapiens.v98.sqlite' to file '/home/jovyan/.cache/genomic-features/87408fa36573814f9212483e6e4a939d-EnsDb.Hsapiens.v98.sqlite'.\n", + "100%|████████████████████████████████████████| 386M/386M [00:00<00:00, 243GB/s]\n", + "SHA256 hash of downloaded file: d02484a69b97465b66121be4c4aada06490267836a12f1c725f5d29ead3e4098\n", + "Use this value as the 'known_hash' argument of 'pooch.retrieve' to ensure that the file hasn't changed if it is downloaded again in the future.\n" + ] + }, { "data": { "text/plain": [ - "EnsemblDB(organism='Homo sapiens', ensembl_release='108')" + "EnsemblDB(organism='Homo sapiens', ensembl_release='98')" ] }, - "execution_count": 2, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ensdb = gf.ensembl.annotation(species=\"Hsapiens\", version=\"108\")\n", + "ensdb = gf.ensembl.annotation(species=\"Hsapiens\", version=\"98\")\n", "ensdb" ] }, @@ -69,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -244,7 +254,7 @@ "3863 Mmusculus 109" ] }, - "execution_count": 3, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -269,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -279,17 +289,17 @@ " 'Type of Gene ID': 'Ensembl Gene ID',\n", " 'Supporting package': 'ensembldb',\n", " 'Db created by': 'ensembldb package from Bioconductor',\n", - " 'script_version': '0.3.7',\n", - " 'Creation time': 'Fri Oct 28 05:24:43 2022',\n", - " 'ensembl_version': '108',\n", + " 'script_version': '0.3.5',\n", + " 'Creation time': 'Mon Nov 18 21:12:12 2019',\n", + " 'ensembl_version': '98',\n", " 'ensembl_host': 'localhost',\n", " 'Organism': 'Homo sapiens',\n", " 'taxonomy_id': '9606',\n", " 'genome_build': 'GRCh38',\n", - " 'DBSCHEMAVERSION': '2.2'}" + " 'DBSCHEMAVERSION': '2.1'}" ] }, - "execution_count": 4, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -307,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -341,7 +351,6 @@ " seq_coord_system\n", " description\n", " gene_id_version\n", - " canonical_transcript\n", " \n", " \n", " \n", @@ -357,7 +366,6 @@ " chromosome\n", " tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858]\n", " ENSG00000000003.15\n", - " ENST00000373020\n", " \n", " \n", " 1\n", @@ -371,7 +379,6 @@ " chromosome\n", " tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757]\n", " ENSG00000000005.6\n", - " ENST00000373031\n", " \n", " \n", " 2\n", @@ -379,13 +386,12 @@ " DPM1\n", " protein_coding\n", " 50934867\n", - " 50959140\n", + " 50958555\n", " 20\n", " -1\n", " chromosome\n", " dolichyl-phosphate mannosyltransferase subunit...\n", - " ENSG00000000419.14\n", - " ENST00000371588\n", + " ENSG00000000419.12\n", " \n", " \n", " 3\n", @@ -399,7 +405,6 @@ " chromosome\n", " SCY1 like pseudokinase 3 [Source:HGNC Symbol;A...\n", " ENSG00000000457.14\n", - " ENST00000367771\n", " \n", " \n", " 4\n", @@ -413,7 +418,6 @@ " chromosome\n", " chromosome 1 open reading frame 112 [Source:HG...\n", " ENSG00000000460.17\n", - " ENST00000359326\n", " \n", " \n", "\n", @@ -423,7 +427,7 @@ " gene_id gene_name gene_biotype gene_seq_start gene_seq_end \n", "0 ENSG00000000003 TSPAN6 protein_coding 100627108 100639991 \\\n", "1 ENSG00000000005 TNMD protein_coding 100584936 100599885 \n", - "2 ENSG00000000419 DPM1 protein_coding 50934867 50959140 \n", + "2 ENSG00000000419 DPM1 protein_coding 50934867 50958555 \n", "3 ENSG00000000457 SCYL3 protein_coding 169849631 169894267 \n", "4 ENSG00000000460 C1orf112 protein_coding 169662007 169854080 \n", "\n", @@ -434,22 +438,15 @@ "3 1 -1 chromosome \n", "4 1 1 chromosome \n", "\n", - " description gene_id_version \n", - "0 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] ENSG00000000003.15 \\\n", - "1 tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] ENSG00000000005.6 \n", - "2 dolichyl-phosphate mannosyltransferase subunit... ENSG00000000419.14 \n", - "3 SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... ENSG00000000457.14 \n", - "4 chromosome 1 open reading frame 112 [Source:HG... ENSG00000000460.17 \n", - "\n", - " canonical_transcript \n", - "0 ENST00000373020 \n", - "1 ENST00000373031 \n", - "2 ENST00000371588 \n", - "3 ENST00000367771 \n", - "4 ENST00000359326 " + " description gene_id_version \n", + "0 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] ENSG00000000003.15 \n", + "1 tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] ENSG00000000005.6 \n", + "2 dolichyl-phosphate mannosyltransferase subunit... ENSG00000000419.12 \n", + "3 SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... ENSG00000000457.14 \n", + "4 chromosome 1 open reading frame 112 [Source:HG... ENSG00000000460.17 " ] }, - "execution_count": 5, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -475,7 +472,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -509,7 +506,6 @@ " seq_coord_system\n", " description\n", " gene_id_version\n", - " canonical_transcript\n", " \n", " \n", " \n", @@ -525,7 +521,6 @@ " chromosome\n", " mitochondrially encoded tRNA-Leu (UUA/G) 1 [So...\n", " ENSG00000209082.1\n", - " ENST00000386347\n", " \n", " \n", " 1\n", @@ -539,7 +534,6 @@ " chromosome\n", " mitochondrially encoded tRNA-Phe (UUU/C) [Sour...\n", " ENSG00000210049.1\n", - " ENST00000387314\n", " \n", " \n", " 2\n", @@ -553,7 +547,6 @@ " chromosome\n", " mitochondrially encoded tRNA-Val (GUN) [Source...\n", " ENSG00000210077.1\n", - " ENST00000387342\n", " \n", " \n", " 3\n", @@ -567,7 +560,6 @@ " chromosome\n", " mitochondrially encoded tRNA-Ile (AUU/C) [Sour...\n", " ENSG00000210100.1\n", - " ENST00000387365\n", " \n", " \n", " 4\n", @@ -581,7 +573,6 @@ " chromosome\n", " mitochondrially encoded tRNA-Gln (CAA/G) [Sour...\n", " ENSG00000210107.1\n", - " ENST00000387372\n", " \n", " \n", "\n", @@ -602,22 +593,15 @@ "3 MT 1 chromosome \n", "4 MT -1 chromosome \n", "\n", - " description gene_id_version \n", - "0 mitochondrially encoded tRNA-Leu (UUA/G) 1 [So... ENSG00000209082.1 \\\n", - "1 mitochondrially encoded tRNA-Phe (UUU/C) [Sour... ENSG00000210049.1 \n", - "2 mitochondrially encoded tRNA-Val (GUN) [Source... ENSG00000210077.1 \n", - "3 mitochondrially encoded tRNA-Ile (AUU/C) [Sour... ENSG00000210100.1 \n", - "4 mitochondrially encoded tRNA-Gln (CAA/G) [Sour... ENSG00000210107.1 \n", - "\n", - " canonical_transcript \n", - "0 ENST00000386347 \n", - "1 ENST00000387314 \n", - "2 ENST00000387342 \n", - "3 ENST00000387365 \n", - "4 ENST00000387372 " + " description gene_id_version \n", + "0 mitochondrially encoded tRNA-Leu (UUA/G) 1 [So... ENSG00000209082.1 \n", + "1 mitochondrially encoded tRNA-Phe (UUU/C) [Sour... ENSG00000210049.1 \n", + "2 mitochondrially encoded tRNA-Val (GUN) [Source... ENSG00000210077.1 \n", + "3 mitochondrially encoded tRNA-Ile (AUU/C) [Sour... ENSG00000210100.1 \n", + "4 mitochondrially encoded tRNA-Gln (CAA/G) [Sour... ENSG00000210107.1 " ] }, - "execution_count": 6, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -635,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -669,7 +653,6 @@ " seq_coord_system\n", " description\n", " gene_id_version\n", - " canonical_transcript\n", " \n", " \n", " \n", @@ -678,14 +661,13 @@ " ENSG00000223972\n", " DDX11L1\n", " transcribed_unprocessed_pseudogene\n", - " 12010\n", - " 13670\n", + " 11869\n", + " 14409\n", " 1\n", " 1\n", " chromosome\n", - " DEAD/H-box helicase 11 like 1 (pseudogene) [So...\n", - " ENSG00000223972.6\n", - " ENST00000450305\n", + " DEAD/H-box helicase 11 like 1 [Source:HGNC Sym...\n", + " ENSG00000223972.5\n", " \n", " \n", " 1\n", @@ -699,7 +681,6 @@ " chromosome\n", " WASP family homolog 7, pseudogene [Source:HGNC...\n", " ENSG00000227232.5\n", - " ENST00000488147\n", " \n", " \n", " 2\n", @@ -713,21 +694,6 @@ " chromosome\n", " microRNA 6859-1 [Source:HGNC Symbol;Acc:HGNC:5...\n", " ENSG00000278267.1\n", - " ENST00000619216\n", - " \n", - " \n", - " 3\n", - " ENSG00000290825\n", - " DDX11L2\n", - " lncRNA\n", - " 11869\n", - " 14409\n", - " 1\n", - " 1\n", - " chromosome\n", - " DEAD/H-box helicase 11 like 2 (pseudogene) [So...\n", - " ENSG00000290825.1\n", - " ENST00000456328\n", " \n", " \n", "\n", @@ -738,28 +704,19 @@ "0 ENSG00000223972 DDX11L1 transcribed_unprocessed_pseudogene \\\n", "1 ENSG00000227232 WASH7P unprocessed_pseudogene \n", "2 ENSG00000278267 MIR6859-1 miRNA \n", - "3 ENSG00000290825 DDX11L2 lncRNA \n", "\n", " gene_seq_start gene_seq_end seq_name seq_strand seq_coord_system \n", - "0 12010 13670 1 1 chromosome \\\n", + "0 11869 14409 1 1 chromosome \\\n", "1 14404 29570 1 -1 chromosome \n", "2 17369 17436 1 -1 chromosome \n", - "3 11869 14409 1 1 chromosome \n", - "\n", - " description gene_id_version \n", - "0 DEAD/H-box helicase 11 like 1 (pseudogene) [So... ENSG00000223972.6 \\\n", - "1 WASP family homolog 7, pseudogene [Source:HGNC... ENSG00000227232.5 \n", - "2 microRNA 6859-1 [Source:HGNC Symbol;Acc:HGNC:5... ENSG00000278267.1 \n", - "3 DEAD/H-box helicase 11 like 2 (pseudogene) [So... ENSG00000290825.1 \n", "\n", - " canonical_transcript \n", - "0 ENST00000450305 \n", - "1 ENST00000488147 \n", - "2 ENST00000619216 \n", - "3 ENST00000456328 " + " description gene_id_version \n", + "0 DEAD/H-box helicase 11 like 1 [Source:HGNC Sym... ENSG00000223972.5 \n", + "1 WASP family homolog 7, pseudogene [Source:HGNC... ENSG00000227232.5 \n", + "2 microRNA 6859-1 [Source:HGNC Symbol;Acc:HGNC:5... ENSG00000278267.1 " ] }, - "execution_count": 7, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -777,7 +734,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -811,43 +768,20 @@ " seq_coord_system\n", " description\n", " gene_id_version\n", - " canonical_transcript\n", " \n", " \n", " \n", - " \n", - " 0\n", - " ENSG00000290825\n", - " DDX11L2\n", - " lncRNA\n", - " 11869\n", - " 14409\n", - " 1\n", - " 1\n", - " chromosome\n", - " DEAD/H-box helicase 11 like 2 (pseudogene) [So...\n", - " ENSG00000290825.1\n", - " ENST00000456328\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " gene_id gene_name gene_biotype gene_seq_start gene_seq_end \n", - "0 ENSG00000290825 DDX11L2 lncRNA 11869 14409 \\\n", - "\n", - " seq_name seq_strand seq_coord_system \n", - "0 1 1 chromosome \\\n", - "\n", - " description gene_id_version \n", - "0 DEAD/H-box helicase 11 like 2 (pseudogene) [So... ENSG00000290825.1 \\\n", - "\n", - " canonical_transcript \n", - "0 ENST00000456328 " + "Empty DataFrame\n", + "Columns: [gene_id, gene_name, gene_biotype, gene_seq_start, gene_seq_end, seq_name, seq_strand, seq_coord_system, description, gene_id_version]\n", + "Index: []" ] }, - "execution_count": 8, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -874,7 +808,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -912,7 +846,7 @@ " ENST00000373020\n", " TSPAN6\n", " ENSP00000362111\n", - " O43657.176\n", + " O43657\n", " \n", " \n", " 1\n", @@ -920,7 +854,7 @@ " ENST00000612152\n", " TSPAN6\n", " ENSP00000482130\n", - " A0A087WYV6.48\n", + " A0A087WYV6\n", " \n", " \n", " 2\n", @@ -928,7 +862,7 @@ " ENST00000614008\n", " TSPAN6\n", " ENSP00000482894\n", - " A0A087WZU5.51\n", + " A0A087WZU5\n", " \n", " \n", " 3\n", @@ -936,30 +870,30 @@ " ENST00000373031\n", " TNMD\n", " ENSP00000362122\n", - " Q9H2S6.148\n", + " Q9H2S6\n", " \n", " \n", " 4\n", " ENSG00000000419\n", - " ENST00000466152\n", + " ENST00000371588\n", " DPM1\n", - " ENSP00000507119\n", - " A0A804HIK9.2\n", + " ENSP00000360644\n", + " A0A0S2Z4Y5\n", " \n", " \n", "\n", "" ], "text/plain": [ - " gene_id tx_id gene_name protein_id uniprot_id\n", - "0 ENSG00000000003 ENST00000373020 TSPAN6 ENSP00000362111 O43657.176\n", - "1 ENSG00000000003 ENST00000612152 TSPAN6 ENSP00000482130 A0A087WYV6.48\n", - "2 ENSG00000000003 ENST00000614008 TSPAN6 ENSP00000482894 A0A087WZU5.51\n", - "3 ENSG00000000005 ENST00000373031 TNMD ENSP00000362122 Q9H2S6.148\n", - "4 ENSG00000000419 ENST00000466152 DPM1 ENSP00000507119 A0A804HIK9.2" + " gene_id tx_id gene_name protein_id uniprot_id\n", + "0 ENSG00000000003 ENST00000373020 TSPAN6 ENSP00000362111 O43657\n", + "1 ENSG00000000003 ENST00000612152 TSPAN6 ENSP00000482130 A0A087WYV6\n", + "2 ENSG00000000003 ENST00000614008 TSPAN6 ENSP00000482894 A0A087WZU5\n", + "3 ENSG00000000005 ENST00000373031 TNMD ENSP00000362122 Q9H2S6\n", + "4 ENSG00000000419 ENST00000371588 DPM1 ENSP00000360644 A0A0S2Z4Y5" ] }, - "execution_count": 9, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -984,7 +918,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 28, "metadata": { "scrolled": true }, @@ -1053,38 +987,38 @@ " ...\n", " \n", " \n", - " 452\n", - " LRG_741\n", - " 231167\n", + " 449\n", + " LRG_311\n", + " 115492\n", " 0\n", " \n", " \n", - " 453\n", - " LRG_763\n", - " 176286\n", + " 450\n", + " LRG_721\n", + " 33396\n", " 0\n", " \n", " \n", - " 454\n", - " LRG_792\n", - " 42144\n", + " 451\n", + " LRG_741\n", + " 231167\n", " 0\n", " \n", " \n", - " 455\n", - " LRG_793\n", - " 38439\n", + " 452\n", + " LRG_763\n", + " 176286\n", " 0\n", " \n", " \n", - " 456\n", + " 453\n", " LRG_93\n", " 22459\n", " 0\n", " \n", " \n", "\n", - "

457 rows × 3 columns

\n", + "

454 rows × 3 columns

\n", "" ], "text/plain": [ @@ -1095,16 +1029,16 @@ "3 6 170805979 0\n", "4 3 198295559 0\n", ".. ... ... ...\n", - "452 LRG_741 231167 0\n", - "453 LRG_763 176286 0\n", - "454 LRG_792 42144 0\n", - "455 LRG_793 38439 0\n", - "456 LRG_93 22459 0\n", + "449 LRG_311 115492 0\n", + "450 LRG_721 33396 0\n", + "451 LRG_741 231167 0\n", + "452 LRG_763 176286 0\n", + "453 LRG_93 22459 0\n", "\n", - "[457 rows x 3 columns]" + "[454 rows x 3 columns]" ] }, - "execution_count": 10, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1117,123 +1051,52 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Adding annotations to an AnnData object:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import scanpy as sc\n", - "pbmc = sc.datasets.pbmc3k()" + "## Adding annotations to an AnnData object\n", + "\n", + "We provide a utility function `annotate` to merge a table of feature annotations from an scverse dataset and a gene annotation table. In this example we use subsampled scRNA-seq data from [Litviňuková et al. 2020](https://www.nature.com/articles/s41586-020-2797-4), where reads were mapped to the hg38 reference genome with [CellRanger v3.0.1](https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/3.0), which uses the Ensembl release 98 for gene annotations." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
gene_ids
index
MIR1302-10ENSG00000243485
FAM138AENSG00000237613
OR4F5ENSG00000186092
RP11-34P13.7ENSG00000238009
RP11-34P13.8ENSG00000239945
\n", - "
" - ], - "text/plain": [ - " gene_ids\n", - "index \n", - "MIR1302-10 ENSG00000243485\n", - "FAM138A ENSG00000237613\n", - "OR4F5 ENSG00000186092\n", - "RP11-34P13.7 ENSG00000238009\n", - "RP11-34P13.8 ENSG00000239945" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34mINFO \u001b[0m File data/hca_subsampled_20k.h5ad already downloaded \n" + ] } ], "source": [ - "pbmc.var.head()" + "import pandas as pd\n", + "import scvi\n", + "adata = scvi.data.heart_cell_atlas_subsampled()" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "pbmc.var = (\n", - " pd.merge(\n", - " (\n", - " pbmc.var\n", - " .reset_index()\n", - " .rename(columns={\"gene_ids\": \"gene_id\", \"index\": \"orig_gene_name\"})\n", - " ),\n", - " genes,\n", - " on=\"gene_id\",\n", - " how=\"left\",\n", - " )\n", - " .set_index(\"gene_id\")\n", - ")" + "adata.var = adata.var[['gene_ids-Harvard-Nuclei']]" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 30, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_985/997274008.py:1: UserWarning: Missing annotations for 156/26662 vars\n", + " annotated_var = gf.annotate(adata.var, genes, var_on='gene_ids-Harvard-Nuclei')\n" + ] + }, { "data": { "text/html": [ @@ -1255,7 +1118,8 @@ " \n", " \n", " \n", - " orig_gene_name\n", + " gene_ids-Harvard-Nuclei\n", + " gene_id\n", " gene_name\n", " gene_biotype\n", " gene_seq_start\n", @@ -1265,154 +1129,262 @@ " seq_coord_system\n", " description\n", " gene_id_version\n", - " canonical_transcript\n", - " \n", - " \n", - " gene_id\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " ENSG00000243485\n", - " MIR1302-10\n", - " MIR1302-2HG\n", + " AL627309.1\n", + " ENSG00000238009\n", + " ENSG00000238009\n", + " AL627309.1\n", " lncRNA\n", - " 29554.0\n", - " 31109.0\n", + " 89295.0\n", + " 133723.0\n", " 1\n", - " 1.0\n", + " -1.0\n", " chromosome\n", - " MIR1302-2 host gene [Source:HGNC Symbol;Acc:HG...\n", - " ENSG00000243485.5\n", - " ENST00000473358\n", + " novel transcript\n", + " ENSG00000238009.6\n", " \n", " \n", - " ENSG00000237613\n", - " FAM138A\n", - " FAM138A\n", + " AC114498.1\n", + " ENSG00000235146\n", + " ENSG00000235146\n", + " AC114498.1\n", " lncRNA\n", - " 34554.0\n", - " 36081.0\n", + " 587629.0\n", + " 594768.0\n", " 1\n", - " -1.0\n", + " 1.0\n", " chromosome\n", - " family with sequence similarity 138 member A [...\n", - " ENSG00000237613.2\n", - " ENST00000417324\n", + " novel transcript\n", + " ENSG00000235146.2\n", " \n", " \n", - " ENSG00000186092\n", - " OR4F5\n", - " OR4F5\n", - " protein_coding\n", - " 65419.0\n", - " 71585.0\n", + " AL669831.2\n", + " ENSG00000229905\n", + " ENSG00000229905\n", + " AL669831.2\n", + " lncRNA\n", + " 760911.0\n", + " 761989.0\n", " 1\n", " 1.0\n", " chromosome\n", - " olfactory receptor family 4 subfamily F member...\n", - " ENSG00000186092.7\n", - " ENST00000641515\n", + " novel transcript\n", + " ENSG00000229905.1\n", " \n", " \n", - " ENSG00000238009\n", - " RP11-34P13.7\n", - " \n", + " AL669831.5\n", + " ENSG00000237491\n", + " ENSG00000237491\n", + " LINC01409\n", " lncRNA\n", - " 89295.0\n", - " 133723.0\n", + " 778747.0\n", + " 810065.0\n", " 1\n", - " -1.0\n", + " 1.0\n", " chromosome\n", - " novel transcript\n", - " ENSG00000238009.6\n", - " ENST00000477740\n", + " long intergenic non-protein coding RNA 1409 [S...\n", + " ENSG00000237491.10\n", " \n", " \n", - " ENSG00000239945\n", - " RP11-34P13.8\n", - " \n", + " FAM87B\n", + " ENSG00000177757\n", + " ENSG00000177757\n", + " FAM87B\n", " lncRNA\n", - " 89551.0\n", - " 91105.0\n", + " 817371.0\n", + " 819837.0\n", " 1\n", - " -1.0\n", + " 1.0\n", " chromosome\n", - " novel transcript\n", - " ENSG00000239945.1\n", - " ENST00000495576\n", + " family with sequence similarity 87 member B [S...\n", + " ENSG00000177757.2\n", + " \n", + " \n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " AC007325.2\n", + " ENSG00000277196\n", + " ENSG00000277196\n", + " AC007325.2\n", + " protein_coding\n", + " 138082.0\n", + " 161852.0\n", + " KI270734.1\n", + " -1.0\n", + " scaffold\n", + " proline dehydrogenase 1, mitochondrial [Source...\n", + " ENSG00000277196.4\n", + " \n", + " \n", + " BX072566.1\n", + " ENSG00000277630\n", + " ENSG00000277630\n", + " BX072566.1\n", + " protein_coding\n", + " 108007.0\n", + " 139659.0\n", + " GL000213.1\n", + " -1.0\n", + " scaffold\n", + " POTE ankyrin domain family member D-like [Sour...\n", + " ENSG00000277630.4\n", + " \n", + " \n", + " AL354822.1\n", + " ENSG00000278384\n", + " ENSG00000278384\n", + " AL354822.1\n", + " protein_coding\n", + " 51867.0\n", + " 54893.0\n", + " GL000218.1\n", + " -1.0\n", + " scaffold\n", + " NULL\n", + " ENSG00000278384.1\n", + " \n", + " \n", + " AC004556.1\n", + " ENSG00000276345\n", + " ENSG00000276345\n", + " AC004556.3\n", + " protein_coding\n", + " 2585.0\n", + " 11802.0\n", + " KI270721.1\n", + " 1.0\n", + " scaffold\n", + " 39S ribosomal protein L23, mitochondrial [Sour...\n", + " ENSG00000276345.1\n", + " \n", + " \n", + " AC240274.1\n", + " ENSG00000271254\n", + " ENSG00000271254\n", + " AC240274.1\n", + " protein_coding\n", + " 4612.0\n", + " 29626.0\n", + " KI270711.1\n", + " -1.0\n", + " scaffold\n", + " neuroblastoma breakpoint family member 1 [Sour...\n", + " ENSG00000271254.6\n", " \n", " \n", "\n", + "

26662 rows × 11 columns

\n", "" ], "text/plain": [ - " orig_gene_name gene_name gene_biotype gene_seq_start \n", - "gene_id \n", - "ENSG00000243485 MIR1302-10 MIR1302-2HG lncRNA 29554.0 \\\n", - "ENSG00000237613 FAM138A FAM138A lncRNA 34554.0 \n", - "ENSG00000186092 OR4F5 OR4F5 protein_coding 65419.0 \n", - "ENSG00000238009 RP11-34P13.7 lncRNA 89295.0 \n", - "ENSG00000239945 RP11-34P13.8 lncRNA 89551.0 \n", + " gene_ids-Harvard-Nuclei gene_id gene_name \n", + "AL627309.1 ENSG00000238009 ENSG00000238009 AL627309.1 \\\n", + "AC114498.1 ENSG00000235146 ENSG00000235146 AC114498.1 \n", + "AL669831.2 ENSG00000229905 ENSG00000229905 AL669831.2 \n", + "AL669831.5 ENSG00000237491 ENSG00000237491 LINC01409 \n", + "FAM87B ENSG00000177757 ENSG00000177757 FAM87B \n", + "... ... ... ... \n", + "AC007325.2 ENSG00000277196 ENSG00000277196 AC007325.2 \n", + "BX072566.1 ENSG00000277630 ENSG00000277630 BX072566.1 \n", + "AL354822.1 ENSG00000278384 ENSG00000278384 AL354822.1 \n", + "AC004556.1 ENSG00000276345 ENSG00000276345 AC004556.3 \n", + "AC240274.1 ENSG00000271254 ENSG00000271254 AC240274.1 \n", "\n", - " gene_seq_end seq_name seq_strand seq_coord_system \n", - "gene_id \n", - "ENSG00000243485 31109.0 1 1.0 chromosome \\\n", - "ENSG00000237613 36081.0 1 -1.0 chromosome \n", - "ENSG00000186092 71585.0 1 1.0 chromosome \n", - "ENSG00000238009 133723.0 1 -1.0 chromosome \n", - "ENSG00000239945 91105.0 1 -1.0 chromosome \n", + " gene_biotype gene_seq_start gene_seq_end seq_name \n", + "AL627309.1 lncRNA 89295.0 133723.0 1 \\\n", + "AC114498.1 lncRNA 587629.0 594768.0 1 \n", + "AL669831.2 lncRNA 760911.0 761989.0 1 \n", + "AL669831.5 lncRNA 778747.0 810065.0 1 \n", + "FAM87B lncRNA 817371.0 819837.0 1 \n", + "... ... ... ... ... \n", + "AC007325.2 protein_coding 138082.0 161852.0 KI270734.1 \n", + "BX072566.1 protein_coding 108007.0 139659.0 GL000213.1 \n", + "AL354822.1 protein_coding 51867.0 54893.0 GL000218.1 \n", + "AC004556.1 protein_coding 2585.0 11802.0 KI270721.1 \n", + "AC240274.1 protein_coding 4612.0 29626.0 KI270711.1 \n", "\n", - " description \n", - "gene_id \n", - "ENSG00000243485 MIR1302-2 host gene [Source:HGNC Symbol;Acc:HG... \\\n", - "ENSG00000237613 family with sequence similarity 138 member A [... \n", - "ENSG00000186092 olfactory receptor family 4 subfamily F member... \n", - "ENSG00000238009 novel transcript \n", - "ENSG00000239945 novel transcript \n", + " seq_strand seq_coord_system \n", + "AL627309.1 -1.0 chromosome \\\n", + "AC114498.1 1.0 chromosome \n", + "AL669831.2 1.0 chromosome \n", + "AL669831.5 1.0 chromosome \n", + "FAM87B 1.0 chromosome \n", + "... ... ... \n", + "AC007325.2 -1.0 scaffold \n", + "BX072566.1 -1.0 scaffold \n", + "AL354822.1 -1.0 scaffold \n", + "AC004556.1 1.0 scaffold \n", + "AC240274.1 -1.0 scaffold \n", "\n", - " gene_id_version canonical_transcript \n", - "gene_id \n", - "ENSG00000243485 ENSG00000243485.5 ENST00000473358 \n", - "ENSG00000237613 ENSG00000237613.2 ENST00000417324 \n", - "ENSG00000186092 ENSG00000186092.7 ENST00000641515 \n", - "ENSG00000238009 ENSG00000238009.6 ENST00000477740 \n", - "ENSG00000239945 ENSG00000239945.1 ENST00000495576 " + " description \n", + "AL627309.1 novel transcript \\\n", + "AC114498.1 novel transcript \n", + "AL669831.2 novel transcript \n", + "AL669831.5 long intergenic non-protein coding RNA 1409 [S... \n", + "FAM87B family with sequence similarity 87 member B [S... \n", + "... ... \n", + "AC007325.2 proline dehydrogenase 1, mitochondrial [Source... \n", + "BX072566.1 POTE ankyrin domain family member D-like [Sour... \n", + "AL354822.1 NULL \n", + "AC004556.1 39S ribosomal protein L23, mitochondrial [Sour... \n", + "AC240274.1 neuroblastoma breakpoint family member 1 [Sour... \n", + "\n", + " gene_id_version \n", + "AL627309.1 ENSG00000238009.6 \n", + "AC114498.1 ENSG00000235146.2 \n", + "AL669831.2 ENSG00000229905.1 \n", + "AL669831.5 ENSG00000237491.10 \n", + "FAM87B ENSG00000177757.2 \n", + "... ... \n", + "AC007325.2 ENSG00000277196.4 \n", + "BX072566.1 ENSG00000277630.4 \n", + "AL354822.1 ENSG00000278384.1 \n", + "AC004556.1 ENSG00000276345.1 \n", + "AC240274.1 ENSG00000271254.6 \n", + "\n", + "[26662 rows x 11 columns]" ] }, - "execution_count": 14, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pbmc.var.head()" + "annotated_var = gf.annotate(adata.var, genes, var_on='gene_ids-Harvard-Nuclei')\n", + "annotated_var" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "adata.var = annotated_var.copy()" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:genomic-annotations-dev]", + "display_name": "Python 3", "language": "python", - "name": "conda-env-genomic-annotations-dev-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1424,12 +1396,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" - }, - "vscode": { - "interpreter": { - "hash": "ae6466e8d4f517858789b5c9e8f0ed238fb8964458a36305fca7bddc149e9c64" - } + "version": "3.8.8" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 8ce0ee5..466dd88 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ doc = [ test = [ "pytest", "pytest-cov", + "anndata" ] [tool.coverage.run] diff --git a/src/genomic_features/__init__.py b/src/genomic_features/__init__.py index 679dc43..e6440ee 100644 --- a/src/genomic_features/__init__.py +++ b/src/genomic_features/__init__.py @@ -1,6 +1,7 @@ from importlib.metadata import version from . import ensembl, filters +from .annotate import annotate __all__ = ["ensembl"] diff --git a/src/genomic_features/annotate.py b/src/genomic_features/annotate.py new file mode 100644 index 0000000..330f568 --- /dev/null +++ b/src/genomic_features/annotate.py @@ -0,0 +1,114 @@ +import warnings + +import pandas as pd + + +def annotate( + adata_var: pd.DataFrame, + annotation_df: pd.DataFrame, + var_on: str = None, + annotation_on: str = None, +) -> pd.DataFrame: + """ + Annotate variables from AnnData object with a DataFrame of annotations. + + Parameters + ---------- + adata_var + The AnnData.var DataFrame. + annotation_df + The DataFrame of annotations (e.g. output of `EnsemblDB.genes()`). + var_on + The column name in `adata_var` to join var_on (useful if geneIDs are not stored in `adata.var_names`). + If `None`, tables are joined based var_on index of `adata_var`. + annotation_on + The column name in `annotation_df` to join var_on. If `None`, the column name is inferred from the + `annotation_df` DataFrame (i.e. the column with unique values ending in `_id`). + + Returns + ------- + annotated_var + The annotated AnnData.var DataFrame. + """ + # Pick column with IDs in annotation table + if annotation_on is None: + annotation_on = [ + i + for i in annotation_df.columns + if annotation_df[i].is_unique and i.endswith("_id") + ] + if len(annotation_on) == 0: + raise ValueError( + "No unique ID column found in annotation_df - specify ID column with `annotation_on` parameter" + ) + if len(annotation_on) > 1: + raise ValueError( + "Multiple unique ID columns found in annotation_df - specify ID column with `annotation_on` parameter" + ) + else: + annotation_on = annotation_on[0] + else: + if not annotation_df[annotation_on].is_unique: + raise ValueError(f"Column {annotation_on} does not contain unique IDs") + + adata_var_merge = adata_var.copy() + + # Check for common columns between adata_var and annotation_df + # (these will be overwritten by the merge) + common_cols = annotation_df.columns[ + annotation_df.columns.isin(adata_var_merge.columns) + ].tolist() + if len(common_cols) > 0: + warnings.warn( + f"Columns {common_cols} are present in both adata_var and annotation_df - these will be overwritten", + stacklevel=2, + ) + adata_var_merge.drop(common_cols, axis=1, inplace=True) + + # Pick IDs in adata_var table + if var_on is None: + assert ( + adata_var_merge.index.is_unique + ), "var_names do not contain unique IDs - specify ID column with `var_on` parameter" + index_name = adata_var_merge.index.name + adata_var_merge = adata_var_merge.reset_index(names="var_names") + else: + if var_on not in adata_var_merge.columns: + raise ValueError( + "Column specified by `var_on` does not exist in adata_var_merge" + ) + if not adata_var_merge[var_on].is_unique: + raise ValueError("Column specified by `var_on` does not contain unique IDs") + adata_var_merge["var_names"] = adata_var_merge[var_on].copy() + + # Merge + annotated_var = ( + adata_var_merge.reset_index() + .merge( + annotation_df, how="left", right_on=[annotation_on], left_on=["var_names"] + ) + .set_index("index") + ) + + # Retain order and var_names + if var_on is None: + annotated_var = annotated_var.set_index("var_names") + annotated_var = annotated_var.loc[adata_var_merge["var_names"]] + annotated_var.index.name = index_name + else: + annotated_var = annotated_var.loc[adata_var_merge.index] + annotated_var = annotated_var.drop("var_names", axis=1) + + # Check for missing genes + missing_vars = annotated_var.index[ + annotated_var[annotation_df.drop([annotation_on], axis=1).columns] + .isna() + .all(axis=1) + ] + if len(missing_vars) > 0: + warnings.warn( + f"Missing annotations for {len(missing_vars)}/{annotated_var.shape[0]} vars", + stacklevel=2, + ) + + return annotated_var diff --git a/tests/test_annotate_anndata.py b/tests/test_annotate_anndata.py new file mode 100644 index 0000000..481e73f --- /dev/null +++ b/tests/test_annotate_anndata.py @@ -0,0 +1,92 @@ +import numpy as np +import pandas as pd +import pytest +from anndata import AnnData + +import genomic_features as gf + + +@pytest.fixture(scope="module") +def genes(): + return gf.ensembl.annotation("Hsapiens", 108).genes() + + +@pytest.fixture(scope="module") +def adata(): + return AnnData( + X=np.array([[1, 2, 3], [4, 5, 6]]), + obs=pd.DataFrame(index=["cell1", "cell2"]), + var=pd.DataFrame( + index=["ENSG00000278862", "ENSG00000212694", "ENSG00000113721"] + ), + ) + + +@pytest.fixture(scope="module") +def annotated_adata_var(adata, genes): + return gf.annotate(adata.var, genes) + + +def test_output(annotated_adata_var): + assert isinstance(annotated_adata_var, pd.DataFrame) + + +def test_size(annotated_adata_var, adata): + assert annotated_adata_var.shape[0] == adata.n_vars + + +def test_var_names(annotated_adata_var, adata): + assert "var_names" not in annotated_adata_var + assert annotated_adata_var.index.equals(adata.var.index) + assert annotated_adata_var.index.name == adata.var.index.name + + +def test_anndata_changes(adata, genes): + gf.annotate(adata.var, genes) + assert "gene_name" not in adata.var + adata.var["gene_ids"] = adata.var_names.copy() + gf.annotate(adata.var, genes, var_on="gene_ids") + assert "gene_name" not in adata.var + + +def test_missing_genes(genes): + # Test missing genes get NAs + adata_custom_genes = AnnData( + X=np.array([[1, 2, 3, 5], [4, 5, 6, 8]]), + obs=pd.DataFrame(index=["cell1", "cell2"]), + var=pd.DataFrame( + index=["ENSG00000278862", "ENSG00000212694", "ENSG00000113721", "GENE1"] + ), + ) + with pytest.warns(UserWarning): + annotated_adata_var = gf.annotate(adata_custom_genes.var, genes) + assert annotated_adata_var.loc["GENE1"][genes.columns].isna().all() + + +def test_unique_ids(genes, adata): + adata_duplicate_genes = AnnData( + X=np.array([[1, 2, 3, 5], [4, 5, 6, 8]]), + obs=pd.DataFrame(index=["cell1", "cell2"]), + var=pd.DataFrame( + index=[ + "ENSG00000278862", + "ENSG00000212694", + "ENSG00000113721", + "ENSG00000212694", + ] + ), + ) + with pytest.raises(AssertionError): + gf.annotate(adata_duplicate_genes.var, genes) + with pytest.raises(ValueError): + gf.annotate(adata.var, genes, annotation_on="gene_name") + + +def test_clashing_columns(adata, genes): + # Test that common columns are dropped + adata.var["gene_biotype"] = "foo" + adata.var["seq_name"] = "chr1" + with pytest.warns(UserWarning, match=r"gene_biotype.+seq_name"): + annotated_var = gf.annotate(adata.var, genes) + assert "lncRNA" in annotated_var["gene_biotype"].values + assert "12" in annotated_var["seq_name"].values