diff --git a/docs/api.md b/docs/api.md
index a0d853c..9dac923 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -9,6 +9,7 @@
.. autosummary::
:toctree: generated
+ genomic_features.annotate
ensembl.annotation
ensembl.EnsemblDB
ensembl.list_ensdb_annotations
diff --git a/docs/notebooks/basic_usage.ipynb b/docs/notebooks/basic_usage.ipynb
index 0a90d2f..cdd6b81 100644
--- a/docs/notebooks/basic_usage.ipynb
+++ b/docs/notebooks/basic_usage.ipynb
@@ -39,22 +39,32 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 19,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Downloading data from 'https://bioconductorhubs.blob.core.windows.net/annotationhub/AHEnsDbs/v98/EnsDb.Hsapiens.v98.sqlite' to file '/home/jovyan/.cache/genomic-features/87408fa36573814f9212483e6e4a939d-EnsDb.Hsapiens.v98.sqlite'.\n",
+ "100%|████████████████████████████████████████| 386M/386M [00:00<00:00, 243GB/s]\n",
+ "SHA256 hash of downloaded file: d02484a69b97465b66121be4c4aada06490267836a12f1c725f5d29ead3e4098\n",
+ "Use this value as the 'known_hash' argument of 'pooch.retrieve' to ensure that the file hasn't changed if it is downloaded again in the future.\n"
+ ]
+ },
{
"data": {
"text/plain": [
- "EnsemblDB(organism='Homo sapiens', ensembl_release='108')"
+ "EnsemblDB(organism='Homo sapiens', ensembl_release='98')"
]
},
- "execution_count": 2,
+ "execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "ensdb = gf.ensembl.annotation(species=\"Hsapiens\", version=\"108\")\n",
+ "ensdb = gf.ensembl.annotation(species=\"Hsapiens\", version=\"98\")\n",
"ensdb"
]
},
@@ -69,7 +79,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 21,
"metadata": {},
"outputs": [
{
@@ -244,7 +254,7 @@
"3863 Mmusculus 109"
]
},
- "execution_count": 3,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@@ -269,7 +279,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 22,
"metadata": {},
"outputs": [
{
@@ -279,17 +289,17 @@
" 'Type of Gene ID': 'Ensembl Gene ID',\n",
" 'Supporting package': 'ensembldb',\n",
" 'Db created by': 'ensembldb package from Bioconductor',\n",
- " 'script_version': '0.3.7',\n",
- " 'Creation time': 'Fri Oct 28 05:24:43 2022',\n",
- " 'ensembl_version': '108',\n",
+ " 'script_version': '0.3.5',\n",
+ " 'Creation time': 'Mon Nov 18 21:12:12 2019',\n",
+ " 'ensembl_version': '98',\n",
" 'ensembl_host': 'localhost',\n",
" 'Organism': 'Homo sapiens',\n",
" 'taxonomy_id': '9606',\n",
" 'genome_build': 'GRCh38',\n",
- " 'DBSCHEMAVERSION': '2.2'}"
+ " 'DBSCHEMAVERSION': '2.1'}"
]
},
- "execution_count": 4,
+ "execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
@@ -307,7 +317,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 23,
"metadata": {},
"outputs": [
{
@@ -341,7 +351,6 @@
"
seq_coord_system | \n",
" description | \n",
" gene_id_version | \n",
- " canonical_transcript | \n",
" \n",
" \n",
" \n",
@@ -357,7 +366,6 @@
" chromosome | \n",
" tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] | \n",
" ENSG00000000003.15 | \n",
- " ENST00000373020 | \n",
" \n",
" \n",
" 1 | \n",
@@ -371,7 +379,6 @@
" chromosome | \n",
" tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] | \n",
" ENSG00000000005.6 | \n",
- " ENST00000373031 | \n",
"
\n",
" \n",
" 2 | \n",
@@ -379,13 +386,12 @@
" DPM1 | \n",
" protein_coding | \n",
" 50934867 | \n",
- " 50959140 | \n",
+ " 50958555 | \n",
" 20 | \n",
" -1 | \n",
" chromosome | \n",
" dolichyl-phosphate mannosyltransferase subunit... | \n",
- " ENSG00000000419.14 | \n",
- " ENST00000371588 | \n",
+ " ENSG00000000419.12 | \n",
"
\n",
" \n",
" 3 | \n",
@@ -399,7 +405,6 @@
" chromosome | \n",
" SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... | \n",
" ENSG00000000457.14 | \n",
- " ENST00000367771 | \n",
"
\n",
" \n",
" 4 | \n",
@@ -413,7 +418,6 @@
" chromosome | \n",
" chromosome 1 open reading frame 112 [Source:HG... | \n",
" ENSG00000000460.17 | \n",
- " ENST00000359326 | \n",
"
\n",
" \n",
"\n",
@@ -423,7 +427,7 @@
" gene_id gene_name gene_biotype gene_seq_start gene_seq_end \n",
"0 ENSG00000000003 TSPAN6 protein_coding 100627108 100639991 \\\n",
"1 ENSG00000000005 TNMD protein_coding 100584936 100599885 \n",
- "2 ENSG00000000419 DPM1 protein_coding 50934867 50959140 \n",
+ "2 ENSG00000000419 DPM1 protein_coding 50934867 50958555 \n",
"3 ENSG00000000457 SCYL3 protein_coding 169849631 169894267 \n",
"4 ENSG00000000460 C1orf112 protein_coding 169662007 169854080 \n",
"\n",
@@ -434,22 +438,15 @@
"3 1 -1 chromosome \n",
"4 1 1 chromosome \n",
"\n",
- " description gene_id_version \n",
- "0 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] ENSG00000000003.15 \\\n",
- "1 tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] ENSG00000000005.6 \n",
- "2 dolichyl-phosphate mannosyltransferase subunit... ENSG00000000419.14 \n",
- "3 SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... ENSG00000000457.14 \n",
- "4 chromosome 1 open reading frame 112 [Source:HG... ENSG00000000460.17 \n",
- "\n",
- " canonical_transcript \n",
- "0 ENST00000373020 \n",
- "1 ENST00000373031 \n",
- "2 ENST00000371588 \n",
- "3 ENST00000367771 \n",
- "4 ENST00000359326 "
+ " description gene_id_version \n",
+ "0 tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858] ENSG00000000003.15 \n",
+ "1 tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757] ENSG00000000005.6 \n",
+ "2 dolichyl-phosphate mannosyltransferase subunit... ENSG00000000419.12 \n",
+ "3 SCY1 like pseudokinase 3 [Source:HGNC Symbol;A... ENSG00000000457.14 \n",
+ "4 chromosome 1 open reading frame 112 [Source:HG... ENSG00000000460.17 "
]
},
- "execution_count": 5,
+ "execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
@@ -475,7 +472,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 24,
"metadata": {},
"outputs": [
{
@@ -509,7 +506,6 @@
" seq_coord_system | \n",
" description | \n",
" gene_id_version | \n",
- " canonical_transcript | \n",
" \n",
" \n",
" \n",
@@ -525,7 +521,6 @@
" chromosome | \n",
" mitochondrially encoded tRNA-Leu (UUA/G) 1 [So... | \n",
" ENSG00000209082.1 | \n",
- " ENST00000386347 | \n",
" \n",
" \n",
" 1 | \n",
@@ -539,7 +534,6 @@
" chromosome | \n",
" mitochondrially encoded tRNA-Phe (UUU/C) [Sour... | \n",
" ENSG00000210049.1 | \n",
- " ENST00000387314 | \n",
"
\n",
" \n",
" 2 | \n",
@@ -553,7 +547,6 @@
" chromosome | \n",
" mitochondrially encoded tRNA-Val (GUN) [Source... | \n",
" ENSG00000210077.1 | \n",
- " ENST00000387342 | \n",
"
\n",
" \n",
" 3 | \n",
@@ -567,7 +560,6 @@
" chromosome | \n",
" mitochondrially encoded tRNA-Ile (AUU/C) [Sour... | \n",
" ENSG00000210100.1 | \n",
- " ENST00000387365 | \n",
"
\n",
" \n",
" 4 | \n",
@@ -581,7 +573,6 @@
" chromosome | \n",
" mitochondrially encoded tRNA-Gln (CAA/G) [Sour... | \n",
" ENSG00000210107.1 | \n",
- " ENST00000387372 | \n",
"
\n",
" \n",
"\n",
@@ -602,22 +593,15 @@
"3 MT 1 chromosome \n",
"4 MT -1 chromosome \n",
"\n",
- " description gene_id_version \n",
- "0 mitochondrially encoded tRNA-Leu (UUA/G) 1 [So... ENSG00000209082.1 \\\n",
- "1 mitochondrially encoded tRNA-Phe (UUU/C) [Sour... ENSG00000210049.1 \n",
- "2 mitochondrially encoded tRNA-Val (GUN) [Source... ENSG00000210077.1 \n",
- "3 mitochondrially encoded tRNA-Ile (AUU/C) [Sour... ENSG00000210100.1 \n",
- "4 mitochondrially encoded tRNA-Gln (CAA/G) [Sour... ENSG00000210107.1 \n",
- "\n",
- " canonical_transcript \n",
- "0 ENST00000386347 \n",
- "1 ENST00000387314 \n",
- "2 ENST00000387342 \n",
- "3 ENST00000387365 \n",
- "4 ENST00000387372 "
+ " description gene_id_version \n",
+ "0 mitochondrially encoded tRNA-Leu (UUA/G) 1 [So... ENSG00000209082.1 \n",
+ "1 mitochondrially encoded tRNA-Phe (UUU/C) [Sour... ENSG00000210049.1 \n",
+ "2 mitochondrially encoded tRNA-Val (GUN) [Source... ENSG00000210077.1 \n",
+ "3 mitochondrially encoded tRNA-Ile (AUU/C) [Sour... ENSG00000210100.1 \n",
+ "4 mitochondrially encoded tRNA-Gln (CAA/G) [Sour... ENSG00000210107.1 "
]
},
- "execution_count": 6,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@@ -635,7 +619,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 25,
"metadata": {},
"outputs": [
{
@@ -669,7 +653,6 @@
" seq_coord_system | \n",
" description | \n",
" gene_id_version | \n",
- " canonical_transcript | \n",
" \n",
" \n",
" \n",
@@ -678,14 +661,13 @@
" ENSG00000223972 | \n",
" DDX11L1 | \n",
" transcribed_unprocessed_pseudogene | \n",
- " 12010 | \n",
- " 13670 | \n",
+ " 11869 | \n",
+ " 14409 | \n",
" 1 | \n",
" 1 | \n",
" chromosome | \n",
- " DEAD/H-box helicase 11 like 1 (pseudogene) [So... | \n",
- " ENSG00000223972.6 | \n",
- " ENST00000450305 | \n",
+ " DEAD/H-box helicase 11 like 1 [Source:HGNC Sym... | \n",
+ " ENSG00000223972.5 | \n",
" \n",
" \n",
" 1 | \n",
@@ -699,7 +681,6 @@
" chromosome | \n",
" WASP family homolog 7, pseudogene [Source:HGNC... | \n",
" ENSG00000227232.5 | \n",
- " ENST00000488147 | \n",
"
\n",
" \n",
" 2 | \n",
@@ -713,21 +694,6 @@
" chromosome | \n",
" microRNA 6859-1 [Source:HGNC Symbol;Acc:HGNC:5... | \n",
" ENSG00000278267.1 | \n",
- " ENST00000619216 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " ENSG00000290825 | \n",
- " DDX11L2 | \n",
- " lncRNA | \n",
- " 11869 | \n",
- " 14409 | \n",
- " 1 | \n",
- " 1 | \n",
- " chromosome | \n",
- " DEAD/H-box helicase 11 like 2 (pseudogene) [So... | \n",
- " ENSG00000290825.1 | \n",
- " ENST00000456328 | \n",
"
\n",
" \n",
"\n",
@@ -738,28 +704,19 @@
"0 ENSG00000223972 DDX11L1 transcribed_unprocessed_pseudogene \\\n",
"1 ENSG00000227232 WASH7P unprocessed_pseudogene \n",
"2 ENSG00000278267 MIR6859-1 miRNA \n",
- "3 ENSG00000290825 DDX11L2 lncRNA \n",
"\n",
" gene_seq_start gene_seq_end seq_name seq_strand seq_coord_system \n",
- "0 12010 13670 1 1 chromosome \\\n",
+ "0 11869 14409 1 1 chromosome \\\n",
"1 14404 29570 1 -1 chromosome \n",
"2 17369 17436 1 -1 chromosome \n",
- "3 11869 14409 1 1 chromosome \n",
- "\n",
- " description gene_id_version \n",
- "0 DEAD/H-box helicase 11 like 1 (pseudogene) [So... ENSG00000223972.6 \\\n",
- "1 WASP family homolog 7, pseudogene [Source:HGNC... ENSG00000227232.5 \n",
- "2 microRNA 6859-1 [Source:HGNC Symbol;Acc:HGNC:5... ENSG00000278267.1 \n",
- "3 DEAD/H-box helicase 11 like 2 (pseudogene) [So... ENSG00000290825.1 \n",
"\n",
- " canonical_transcript \n",
- "0 ENST00000450305 \n",
- "1 ENST00000488147 \n",
- "2 ENST00000619216 \n",
- "3 ENST00000456328 "
+ " description gene_id_version \n",
+ "0 DEAD/H-box helicase 11 like 1 [Source:HGNC Sym... ENSG00000223972.5 \n",
+ "1 WASP family homolog 7, pseudogene [Source:HGNC... ENSG00000227232.5 \n",
+ "2 microRNA 6859-1 [Source:HGNC Symbol;Acc:HGNC:5... ENSG00000278267.1 "
]
},
- "execution_count": 7,
+ "execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
@@ -777,7 +734,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 26,
"metadata": {},
"outputs": [
{
@@ -811,43 +768,20 @@
" seq_coord_system | \n",
" description | \n",
" gene_id_version | \n",
- " canonical_transcript | \n",
" \n",
" \n",
" \n",
- " \n",
- " 0 | \n",
- " ENSG00000290825 | \n",
- " DDX11L2 | \n",
- " lncRNA | \n",
- " 11869 | \n",
- " 14409 | \n",
- " 1 | \n",
- " 1 | \n",
- " chromosome | \n",
- " DEAD/H-box helicase 11 like 2 (pseudogene) [So... | \n",
- " ENSG00000290825.1 | \n",
- " ENST00000456328 | \n",
- "
\n",
" \n",
"\n",
""
],
"text/plain": [
- " gene_id gene_name gene_biotype gene_seq_start gene_seq_end \n",
- "0 ENSG00000290825 DDX11L2 lncRNA 11869 14409 \\\n",
- "\n",
- " seq_name seq_strand seq_coord_system \n",
- "0 1 1 chromosome \\\n",
- "\n",
- " description gene_id_version \n",
- "0 DEAD/H-box helicase 11 like 2 (pseudogene) [So... ENSG00000290825.1 \\\n",
- "\n",
- " canonical_transcript \n",
- "0 ENST00000456328 "
+ "Empty DataFrame\n",
+ "Columns: [gene_id, gene_name, gene_biotype, gene_seq_start, gene_seq_end, seq_name, seq_strand, seq_coord_system, description, gene_id_version]\n",
+ "Index: []"
]
},
- "execution_count": 8,
+ "execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@@ -874,7 +808,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 27,
"metadata": {},
"outputs": [
{
@@ -912,7 +846,7 @@
" ENST00000373020 | \n",
" TSPAN6 | \n",
" ENSP00000362111 | \n",
- " O43657.176 | \n",
+ " O43657 | \n",
" \n",
" \n",
" 1 | \n",
@@ -920,7 +854,7 @@
" ENST00000612152 | \n",
" TSPAN6 | \n",
" ENSP00000482130 | \n",
- " A0A087WYV6.48 | \n",
+ " A0A087WYV6 | \n",
"
\n",
" \n",
" 2 | \n",
@@ -928,7 +862,7 @@
" ENST00000614008 | \n",
" TSPAN6 | \n",
" ENSP00000482894 | \n",
- " A0A087WZU5.51 | \n",
+ " A0A087WZU5 | \n",
"
\n",
" \n",
" 3 | \n",
@@ -936,30 +870,30 @@
" ENST00000373031 | \n",
" TNMD | \n",
" ENSP00000362122 | \n",
- " Q9H2S6.148 | \n",
+ " Q9H2S6 | \n",
"
\n",
" \n",
" 4 | \n",
" ENSG00000000419 | \n",
- " ENST00000466152 | \n",
+ " ENST00000371588 | \n",
" DPM1 | \n",
- " ENSP00000507119 | \n",
- " A0A804HIK9.2 | \n",
+ " ENSP00000360644 | \n",
+ " A0A0S2Z4Y5 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " gene_id tx_id gene_name protein_id uniprot_id\n",
- "0 ENSG00000000003 ENST00000373020 TSPAN6 ENSP00000362111 O43657.176\n",
- "1 ENSG00000000003 ENST00000612152 TSPAN6 ENSP00000482130 A0A087WYV6.48\n",
- "2 ENSG00000000003 ENST00000614008 TSPAN6 ENSP00000482894 A0A087WZU5.51\n",
- "3 ENSG00000000005 ENST00000373031 TNMD ENSP00000362122 Q9H2S6.148\n",
- "4 ENSG00000000419 ENST00000466152 DPM1 ENSP00000507119 A0A804HIK9.2"
+ " gene_id tx_id gene_name protein_id uniprot_id\n",
+ "0 ENSG00000000003 ENST00000373020 TSPAN6 ENSP00000362111 O43657\n",
+ "1 ENSG00000000003 ENST00000612152 TSPAN6 ENSP00000482130 A0A087WYV6\n",
+ "2 ENSG00000000003 ENST00000614008 TSPAN6 ENSP00000482894 A0A087WZU5\n",
+ "3 ENSG00000000005 ENST00000373031 TNMD ENSP00000362122 Q9H2S6\n",
+ "4 ENSG00000000419 ENST00000371588 DPM1 ENSP00000360644 A0A0S2Z4Y5"
]
},
- "execution_count": 9,
+ "execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
@@ -984,7 +918,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 28,
"metadata": {
"scrolled": true
},
@@ -1053,38 +987,38 @@
" ... | \n",
" \n",
" \n",
- " 452 | \n",
- " LRG_741 | \n",
- " 231167 | \n",
+ " 449 | \n",
+ " LRG_311 | \n",
+ " 115492 | \n",
" 0 | \n",
"
\n",
" \n",
- " 453 | \n",
- " LRG_763 | \n",
- " 176286 | \n",
+ " 450 | \n",
+ " LRG_721 | \n",
+ " 33396 | \n",
" 0 | \n",
"
\n",
" \n",
- " 454 | \n",
- " LRG_792 | \n",
- " 42144 | \n",
+ " 451 | \n",
+ " LRG_741 | \n",
+ " 231167 | \n",
" 0 | \n",
"
\n",
" \n",
- " 455 | \n",
- " LRG_793 | \n",
- " 38439 | \n",
+ " 452 | \n",
+ " LRG_763 | \n",
+ " 176286 | \n",
" 0 | \n",
"
\n",
" \n",
- " 456 | \n",
+ " 453 | \n",
" LRG_93 | \n",
" 22459 | \n",
" 0 | \n",
"
\n",
" \n",
"\n",
- "457 rows × 3 columns
\n",
+ "454 rows × 3 columns
\n",
""
],
"text/plain": [
@@ -1095,16 +1029,16 @@
"3 6 170805979 0\n",
"4 3 198295559 0\n",
".. ... ... ...\n",
- "452 LRG_741 231167 0\n",
- "453 LRG_763 176286 0\n",
- "454 LRG_792 42144 0\n",
- "455 LRG_793 38439 0\n",
- "456 LRG_93 22459 0\n",
+ "449 LRG_311 115492 0\n",
+ "450 LRG_721 33396 0\n",
+ "451 LRG_741 231167 0\n",
+ "452 LRG_763 176286 0\n",
+ "453 LRG_93 22459 0\n",
"\n",
- "[457 rows x 3 columns]"
+ "[454 rows x 3 columns]"
]
},
- "execution_count": 10,
+ "execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
@@ -1117,123 +1051,52 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Adding annotations to an AnnData object:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import scanpy as sc\n",
- "pbmc = sc.datasets.pbmc3k()"
+ "## Adding annotations to an AnnData object\n",
+ "\n",
+ "We provide a utility function `annotate` to merge a table of feature annotations from an scverse dataset and a gene annotation table. In this example we use subsampled scRNA-seq data from [Litviňuková et al. 2020](https://www.nature.com/articles/s41586-020-2797-4), where reads were mapped to the hg38 reference genome with [CellRanger v3.0.1](https://support.10xgenomics.com/single-cell-gene-expression/software/downloads/3.0), which uses the Ensembl release 98 for gene annotations."
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " gene_ids | \n",
- "
\n",
- " \n",
- " index | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " MIR1302-10 | \n",
- " ENSG00000243485 | \n",
- "
\n",
- " \n",
- " FAM138A | \n",
- " ENSG00000237613 | \n",
- "
\n",
- " \n",
- " OR4F5 | \n",
- " ENSG00000186092 | \n",
- "
\n",
- " \n",
- " RP11-34P13.7 | \n",
- " ENSG00000238009 | \n",
- "
\n",
- " \n",
- " RP11-34P13.8 | \n",
- " ENSG00000239945 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " gene_ids\n",
- "index \n",
- "MIR1302-10 ENSG00000243485\n",
- "FAM138A ENSG00000237613\n",
- "OR4F5 ENSG00000186092\n",
- "RP11-34P13.7 ENSG00000238009\n",
- "RP11-34P13.8 ENSG00000239945"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[34mINFO \u001b[0m File data/hca_subsampled_20k.h5ad already downloaded \n"
+ ]
}
],
"source": [
- "pbmc.var.head()"
+ "import pandas as pd\n",
+ "import scvi\n",
+ "adata = scvi.data.heart_cell_atlas_subsampled()"
]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
- "pbmc.var = (\n",
- " pd.merge(\n",
- " (\n",
- " pbmc.var\n",
- " .reset_index()\n",
- " .rename(columns={\"gene_ids\": \"gene_id\", \"index\": \"orig_gene_name\"})\n",
- " ),\n",
- " genes,\n",
- " on=\"gene_id\",\n",
- " how=\"left\",\n",
- " )\n",
- " .set_index(\"gene_id\")\n",
- ")"
+ "adata.var = adata.var[['gene_ids-Harvard-Nuclei']]"
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 30,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_985/997274008.py:1: UserWarning: Missing annotations for 156/26662 vars\n",
+ " annotated_var = gf.annotate(adata.var, genes, var_on='gene_ids-Harvard-Nuclei')\n"
+ ]
+ },
{
"data": {
"text/html": [
@@ -1255,7 +1118,8 @@
" \n",
" \n",
" | \n",
- " orig_gene_name | \n",
+ " gene_ids-Harvard-Nuclei | \n",
+ " gene_id | \n",
" gene_name | \n",
" gene_biotype | \n",
" gene_seq_start | \n",
@@ -1265,154 +1129,262 @@
" seq_coord_system | \n",
" description | \n",
" gene_id_version | \n",
- " canonical_transcript | \n",
- "
\n",
- " \n",
- " gene_id | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
"
\n",
" \n",
" \n",
" \n",
- " ENSG00000243485 | \n",
- " MIR1302-10 | \n",
- " MIR1302-2HG | \n",
+ " AL627309.1 | \n",
+ " ENSG00000238009 | \n",
+ " ENSG00000238009 | \n",
+ " AL627309.1 | \n",
" lncRNA | \n",
- " 29554.0 | \n",
- " 31109.0 | \n",
+ " 89295.0 | \n",
+ " 133723.0 | \n",
" 1 | \n",
- " 1.0 | \n",
+ " -1.0 | \n",
" chromosome | \n",
- " MIR1302-2 host gene [Source:HGNC Symbol;Acc:HG... | \n",
- " ENSG00000243485.5 | \n",
- " ENST00000473358 | \n",
+ " novel transcript | \n",
+ " ENSG00000238009.6 | \n",
"
\n",
" \n",
- " ENSG00000237613 | \n",
- " FAM138A | \n",
- " FAM138A | \n",
+ " AC114498.1 | \n",
+ " ENSG00000235146 | \n",
+ " ENSG00000235146 | \n",
+ " AC114498.1 | \n",
" lncRNA | \n",
- " 34554.0 | \n",
- " 36081.0 | \n",
+ " 587629.0 | \n",
+ " 594768.0 | \n",
" 1 | \n",
- " -1.0 | \n",
+ " 1.0 | \n",
" chromosome | \n",
- " family with sequence similarity 138 member A [... | \n",
- " ENSG00000237613.2 | \n",
- " ENST00000417324 | \n",
+ " novel transcript | \n",
+ " ENSG00000235146.2 | \n",
"
\n",
" \n",
- " ENSG00000186092 | \n",
- " OR4F5 | \n",
- " OR4F5 | \n",
- " protein_coding | \n",
- " 65419.0 | \n",
- " 71585.0 | \n",
+ " AL669831.2 | \n",
+ " ENSG00000229905 | \n",
+ " ENSG00000229905 | \n",
+ " AL669831.2 | \n",
+ " lncRNA | \n",
+ " 760911.0 | \n",
+ " 761989.0 | \n",
" 1 | \n",
" 1.0 | \n",
" chromosome | \n",
- " olfactory receptor family 4 subfamily F member... | \n",
- " ENSG00000186092.7 | \n",
- " ENST00000641515 | \n",
+ " novel transcript | \n",
+ " ENSG00000229905.1 | \n",
"
\n",
" \n",
- " ENSG00000238009 | \n",
- " RP11-34P13.7 | \n",
- " | \n",
+ " AL669831.5 | \n",
+ " ENSG00000237491 | \n",
+ " ENSG00000237491 | \n",
+ " LINC01409 | \n",
" lncRNA | \n",
- " 89295.0 | \n",
- " 133723.0 | \n",
+ " 778747.0 | \n",
+ " 810065.0 | \n",
" 1 | \n",
- " -1.0 | \n",
+ " 1.0 | \n",
" chromosome | \n",
- " novel transcript | \n",
- " ENSG00000238009.6 | \n",
- " ENST00000477740 | \n",
+ " long intergenic non-protein coding RNA 1409 [S... | \n",
+ " ENSG00000237491.10 | \n",
"
\n",
" \n",
- " ENSG00000239945 | \n",
- " RP11-34P13.8 | \n",
- " | \n",
+ " FAM87B | \n",
+ " ENSG00000177757 | \n",
+ " ENSG00000177757 | \n",
+ " FAM87B | \n",
" lncRNA | \n",
- " 89551.0 | \n",
- " 91105.0 | \n",
+ " 817371.0 | \n",
+ " 819837.0 | \n",
" 1 | \n",
- " -1.0 | \n",
+ " 1.0 | \n",
" chromosome | \n",
- " novel transcript | \n",
- " ENSG00000239945.1 | \n",
- " ENST00000495576 | \n",
+ " family with sequence similarity 87 member B [S... | \n",
+ " ENSG00000177757.2 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " AC007325.2 | \n",
+ " ENSG00000277196 | \n",
+ " ENSG00000277196 | \n",
+ " AC007325.2 | \n",
+ " protein_coding | \n",
+ " 138082.0 | \n",
+ " 161852.0 | \n",
+ " KI270734.1 | \n",
+ " -1.0 | \n",
+ " scaffold | \n",
+ " proline dehydrogenase 1, mitochondrial [Source... | \n",
+ " ENSG00000277196.4 | \n",
+ "
\n",
+ " \n",
+ " BX072566.1 | \n",
+ " ENSG00000277630 | \n",
+ " ENSG00000277630 | \n",
+ " BX072566.1 | \n",
+ " protein_coding | \n",
+ " 108007.0 | \n",
+ " 139659.0 | \n",
+ " GL000213.1 | \n",
+ " -1.0 | \n",
+ " scaffold | \n",
+ " POTE ankyrin domain family member D-like [Sour... | \n",
+ " ENSG00000277630.4 | \n",
+ "
\n",
+ " \n",
+ " AL354822.1 | \n",
+ " ENSG00000278384 | \n",
+ " ENSG00000278384 | \n",
+ " AL354822.1 | \n",
+ " protein_coding | \n",
+ " 51867.0 | \n",
+ " 54893.0 | \n",
+ " GL000218.1 | \n",
+ " -1.0 | \n",
+ " scaffold | \n",
+ " NULL | \n",
+ " ENSG00000278384.1 | \n",
+ "
\n",
+ " \n",
+ " AC004556.1 | \n",
+ " ENSG00000276345 | \n",
+ " ENSG00000276345 | \n",
+ " AC004556.3 | \n",
+ " protein_coding | \n",
+ " 2585.0 | \n",
+ " 11802.0 | \n",
+ " KI270721.1 | \n",
+ " 1.0 | \n",
+ " scaffold | \n",
+ " 39S ribosomal protein L23, mitochondrial [Sour... | \n",
+ " ENSG00000276345.1 | \n",
+ "
\n",
+ " \n",
+ " AC240274.1 | \n",
+ " ENSG00000271254 | \n",
+ " ENSG00000271254 | \n",
+ " AC240274.1 | \n",
+ " protein_coding | \n",
+ " 4612.0 | \n",
+ " 29626.0 | \n",
+ " KI270711.1 | \n",
+ " -1.0 | \n",
+ " scaffold | \n",
+ " neuroblastoma breakpoint family member 1 [Sour... | \n",
+ " ENSG00000271254.6 | \n",
"
\n",
" \n",
"\n",
+ "26662 rows × 11 columns
\n",
""
],
"text/plain": [
- " orig_gene_name gene_name gene_biotype gene_seq_start \n",
- "gene_id \n",
- "ENSG00000243485 MIR1302-10 MIR1302-2HG lncRNA 29554.0 \\\n",
- "ENSG00000237613 FAM138A FAM138A lncRNA 34554.0 \n",
- "ENSG00000186092 OR4F5 OR4F5 protein_coding 65419.0 \n",
- "ENSG00000238009 RP11-34P13.7 lncRNA 89295.0 \n",
- "ENSG00000239945 RP11-34P13.8 lncRNA 89551.0 \n",
+ " gene_ids-Harvard-Nuclei gene_id gene_name \n",
+ "AL627309.1 ENSG00000238009 ENSG00000238009 AL627309.1 \\\n",
+ "AC114498.1 ENSG00000235146 ENSG00000235146 AC114498.1 \n",
+ "AL669831.2 ENSG00000229905 ENSG00000229905 AL669831.2 \n",
+ "AL669831.5 ENSG00000237491 ENSG00000237491 LINC01409 \n",
+ "FAM87B ENSG00000177757 ENSG00000177757 FAM87B \n",
+ "... ... ... ... \n",
+ "AC007325.2 ENSG00000277196 ENSG00000277196 AC007325.2 \n",
+ "BX072566.1 ENSG00000277630 ENSG00000277630 BX072566.1 \n",
+ "AL354822.1 ENSG00000278384 ENSG00000278384 AL354822.1 \n",
+ "AC004556.1 ENSG00000276345 ENSG00000276345 AC004556.3 \n",
+ "AC240274.1 ENSG00000271254 ENSG00000271254 AC240274.1 \n",
"\n",
- " gene_seq_end seq_name seq_strand seq_coord_system \n",
- "gene_id \n",
- "ENSG00000243485 31109.0 1 1.0 chromosome \\\n",
- "ENSG00000237613 36081.0 1 -1.0 chromosome \n",
- "ENSG00000186092 71585.0 1 1.0 chromosome \n",
- "ENSG00000238009 133723.0 1 -1.0 chromosome \n",
- "ENSG00000239945 91105.0 1 -1.0 chromosome \n",
+ " gene_biotype gene_seq_start gene_seq_end seq_name \n",
+ "AL627309.1 lncRNA 89295.0 133723.0 1 \\\n",
+ "AC114498.1 lncRNA 587629.0 594768.0 1 \n",
+ "AL669831.2 lncRNA 760911.0 761989.0 1 \n",
+ "AL669831.5 lncRNA 778747.0 810065.0 1 \n",
+ "FAM87B lncRNA 817371.0 819837.0 1 \n",
+ "... ... ... ... ... \n",
+ "AC007325.2 protein_coding 138082.0 161852.0 KI270734.1 \n",
+ "BX072566.1 protein_coding 108007.0 139659.0 GL000213.1 \n",
+ "AL354822.1 protein_coding 51867.0 54893.0 GL000218.1 \n",
+ "AC004556.1 protein_coding 2585.0 11802.0 KI270721.1 \n",
+ "AC240274.1 protein_coding 4612.0 29626.0 KI270711.1 \n",
"\n",
- " description \n",
- "gene_id \n",
- "ENSG00000243485 MIR1302-2 host gene [Source:HGNC Symbol;Acc:HG... \\\n",
- "ENSG00000237613 family with sequence similarity 138 member A [... \n",
- "ENSG00000186092 olfactory receptor family 4 subfamily F member... \n",
- "ENSG00000238009 novel transcript \n",
- "ENSG00000239945 novel transcript \n",
+ " seq_strand seq_coord_system \n",
+ "AL627309.1 -1.0 chromosome \\\n",
+ "AC114498.1 1.0 chromosome \n",
+ "AL669831.2 1.0 chromosome \n",
+ "AL669831.5 1.0 chromosome \n",
+ "FAM87B 1.0 chromosome \n",
+ "... ... ... \n",
+ "AC007325.2 -1.0 scaffold \n",
+ "BX072566.1 -1.0 scaffold \n",
+ "AL354822.1 -1.0 scaffold \n",
+ "AC004556.1 1.0 scaffold \n",
+ "AC240274.1 -1.0 scaffold \n",
"\n",
- " gene_id_version canonical_transcript \n",
- "gene_id \n",
- "ENSG00000243485 ENSG00000243485.5 ENST00000473358 \n",
- "ENSG00000237613 ENSG00000237613.2 ENST00000417324 \n",
- "ENSG00000186092 ENSG00000186092.7 ENST00000641515 \n",
- "ENSG00000238009 ENSG00000238009.6 ENST00000477740 \n",
- "ENSG00000239945 ENSG00000239945.1 ENST00000495576 "
+ " description \n",
+ "AL627309.1 novel transcript \\\n",
+ "AC114498.1 novel transcript \n",
+ "AL669831.2 novel transcript \n",
+ "AL669831.5 long intergenic non-protein coding RNA 1409 [S... \n",
+ "FAM87B family with sequence similarity 87 member B [S... \n",
+ "... ... \n",
+ "AC007325.2 proline dehydrogenase 1, mitochondrial [Source... \n",
+ "BX072566.1 POTE ankyrin domain family member D-like [Sour... \n",
+ "AL354822.1 NULL \n",
+ "AC004556.1 39S ribosomal protein L23, mitochondrial [Sour... \n",
+ "AC240274.1 neuroblastoma breakpoint family member 1 [Sour... \n",
+ "\n",
+ " gene_id_version \n",
+ "AL627309.1 ENSG00000238009.6 \n",
+ "AC114498.1 ENSG00000235146.2 \n",
+ "AL669831.2 ENSG00000229905.1 \n",
+ "AL669831.5 ENSG00000237491.10 \n",
+ "FAM87B ENSG00000177757.2 \n",
+ "... ... \n",
+ "AC007325.2 ENSG00000277196.4 \n",
+ "BX072566.1 ENSG00000277630.4 \n",
+ "AL354822.1 ENSG00000278384.1 \n",
+ "AC004556.1 ENSG00000276345.1 \n",
+ "AC240274.1 ENSG00000271254.6 \n",
+ "\n",
+ "[26662 rows x 11 columns]"
]
},
- "execution_count": 14,
+ "execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "pbmc.var.head()"
+ "annotated_var = gf.annotate(adata.var, genes, var_on='gene_ids-Harvard-Nuclei')\n",
+ "annotated_var"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 28,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "adata.var = annotated_var.copy()"
+ ]
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python [conda env:genomic-annotations-dev]",
+ "display_name": "Python 3",
"language": "python",
- "name": "conda-env-genomic-annotations-dev-py"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -1424,12 +1396,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.16"
- },
- "vscode": {
- "interpreter": {
- "hash": "ae6466e8d4f517858789b5c9e8f0ed238fb8964458a36305fca7bddc149e9c64"
- }
+ "version": "3.8.8"
}
},
"nbformat": 4,
diff --git a/pyproject.toml b/pyproject.toml
index 8ce0ee5..466dd88 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,7 @@ doc = [
test = [
"pytest",
"pytest-cov",
+ "anndata"
]
[tool.coverage.run]
diff --git a/src/genomic_features/__init__.py b/src/genomic_features/__init__.py
index 679dc43..e6440ee 100644
--- a/src/genomic_features/__init__.py
+++ b/src/genomic_features/__init__.py
@@ -1,6 +1,7 @@
from importlib.metadata import version
from . import ensembl, filters
+from .annotate import annotate
__all__ = ["ensembl"]
diff --git a/src/genomic_features/annotate.py b/src/genomic_features/annotate.py
new file mode 100644
index 0000000..330f568
--- /dev/null
+++ b/src/genomic_features/annotate.py
@@ -0,0 +1,114 @@
+import warnings
+
+import pandas as pd
+
+
+def annotate(
+ adata_var: pd.DataFrame,
+ annotation_df: pd.DataFrame,
+ var_on: str = None,
+ annotation_on: str = None,
+) -> pd.DataFrame:
+ """
+ Annotate variables from AnnData object with a DataFrame of annotations.
+
+ Parameters
+ ----------
+ adata_var
+ The AnnData.var DataFrame.
+ annotation_df
+ The DataFrame of annotations (e.g. output of `EnsemblDB.genes()`).
+ var_on
+ The column name in `adata_var` to join var_on (useful if geneIDs are not stored in `adata.var_names`).
+ If `None`, tables are joined based var_on index of `adata_var`.
+ annotation_on
+ The column name in `annotation_df` to join var_on. If `None`, the column name is inferred from the
+ `annotation_df` DataFrame (i.e. the column with unique values ending in `_id`).
+
+ Returns
+ -------
+ annotated_var
+ The annotated AnnData.var DataFrame.
+ """
+ # Pick column with IDs in annotation table
+ if annotation_on is None:
+ annotation_on = [
+ i
+ for i in annotation_df.columns
+ if annotation_df[i].is_unique and i.endswith("_id")
+ ]
+ if len(annotation_on) == 0:
+ raise ValueError(
+ "No unique ID column found in annotation_df - specify ID column with `annotation_on` parameter"
+ )
+ if len(annotation_on) > 1:
+ raise ValueError(
+ "Multiple unique ID columns found in annotation_df - specify ID column with `annotation_on` parameter"
+ )
+ else:
+ annotation_on = annotation_on[0]
+ else:
+ if not annotation_df[annotation_on].is_unique:
+ raise ValueError(f"Column {annotation_on} does not contain unique IDs")
+
+ adata_var_merge = adata_var.copy()
+
+ # Check for common columns between adata_var and annotation_df
+ # (these will be overwritten by the merge)
+ common_cols = annotation_df.columns[
+ annotation_df.columns.isin(adata_var_merge.columns)
+ ].tolist()
+ if len(common_cols) > 0:
+ warnings.warn(
+ f"Columns {common_cols} are present in both adata_var and annotation_df - these will be overwritten",
+ stacklevel=2,
+ )
+ adata_var_merge.drop(common_cols, axis=1, inplace=True)
+
+ # Pick IDs in adata_var table
+ if var_on is None:
+ assert (
+ adata_var_merge.index.is_unique
+ ), "var_names do not contain unique IDs - specify ID column with `var_on` parameter"
+ index_name = adata_var_merge.index.name
+ adata_var_merge = adata_var_merge.reset_index(names="var_names")
+ else:
+ if var_on not in adata_var_merge.columns:
+ raise ValueError(
+ "Column specified by `var_on` does not exist in adata_var_merge"
+ )
+ if not adata_var_merge[var_on].is_unique:
+ raise ValueError("Column specified by `var_on` does not contain unique IDs")
+ adata_var_merge["var_names"] = adata_var_merge[var_on].copy()
+
+ # Merge
+ annotated_var = (
+ adata_var_merge.reset_index()
+ .merge(
+ annotation_df, how="left", right_on=[annotation_on], left_on=["var_names"]
+ )
+ .set_index("index")
+ )
+
+ # Retain order and var_names
+ if var_on is None:
+ annotated_var = annotated_var.set_index("var_names")
+ annotated_var = annotated_var.loc[adata_var_merge["var_names"]]
+ annotated_var.index.name = index_name
+ else:
+ annotated_var = annotated_var.loc[adata_var_merge.index]
+ annotated_var = annotated_var.drop("var_names", axis=1)
+
+ # Check for missing genes
+ missing_vars = annotated_var.index[
+ annotated_var[annotation_df.drop([annotation_on], axis=1).columns]
+ .isna()
+ .all(axis=1)
+ ]
+ if len(missing_vars) > 0:
+ warnings.warn(
+ f"Missing annotations for {len(missing_vars)}/{annotated_var.shape[0]} vars",
+ stacklevel=2,
+ )
+
+ return annotated_var
diff --git a/tests/test_annotate_anndata.py b/tests/test_annotate_anndata.py
new file mode 100644
index 0000000..481e73f
--- /dev/null
+++ b/tests/test_annotate_anndata.py
@@ -0,0 +1,92 @@
+import numpy as np
+import pandas as pd
+import pytest
+from anndata import AnnData
+
+import genomic_features as gf
+
+
+@pytest.fixture(scope="module")
+def genes():
+ return gf.ensembl.annotation("Hsapiens", 108).genes()
+
+
+@pytest.fixture(scope="module")
+def adata():
+ return AnnData(
+ X=np.array([[1, 2, 3], [4, 5, 6]]),
+ obs=pd.DataFrame(index=["cell1", "cell2"]),
+ var=pd.DataFrame(
+ index=["ENSG00000278862", "ENSG00000212694", "ENSG00000113721"]
+ ),
+ )
+
+
+@pytest.fixture(scope="module")
+def annotated_adata_var(adata, genes):
+ return gf.annotate(adata.var, genes)
+
+
+def test_output(annotated_adata_var):
+ assert isinstance(annotated_adata_var, pd.DataFrame)
+
+
+def test_size(annotated_adata_var, adata):
+ assert annotated_adata_var.shape[0] == adata.n_vars
+
+
+def test_var_names(annotated_adata_var, adata):
+ assert "var_names" not in annotated_adata_var
+ assert annotated_adata_var.index.equals(adata.var.index)
+ assert annotated_adata_var.index.name == adata.var.index.name
+
+
+def test_anndata_changes(adata, genes):
+ gf.annotate(adata.var, genes)
+ assert "gene_name" not in adata.var
+ adata.var["gene_ids"] = adata.var_names.copy()
+ gf.annotate(adata.var, genes, var_on="gene_ids")
+ assert "gene_name" not in adata.var
+
+
+def test_missing_genes(genes):
+ # Test missing genes get NAs
+ adata_custom_genes = AnnData(
+ X=np.array([[1, 2, 3, 5], [4, 5, 6, 8]]),
+ obs=pd.DataFrame(index=["cell1", "cell2"]),
+ var=pd.DataFrame(
+ index=["ENSG00000278862", "ENSG00000212694", "ENSG00000113721", "GENE1"]
+ ),
+ )
+ with pytest.warns(UserWarning):
+ annotated_adata_var = gf.annotate(adata_custom_genes.var, genes)
+ assert annotated_adata_var.loc["GENE1"][genes.columns].isna().all()
+
+
+def test_unique_ids(genes, adata):
+ adata_duplicate_genes = AnnData(
+ X=np.array([[1, 2, 3, 5], [4, 5, 6, 8]]),
+ obs=pd.DataFrame(index=["cell1", "cell2"]),
+ var=pd.DataFrame(
+ index=[
+ "ENSG00000278862",
+ "ENSG00000212694",
+ "ENSG00000113721",
+ "ENSG00000212694",
+ ]
+ ),
+ )
+ with pytest.raises(AssertionError):
+ gf.annotate(adata_duplicate_genes.var, genes)
+ with pytest.raises(ValueError):
+ gf.annotate(adata.var, genes, annotation_on="gene_name")
+
+
+def test_clashing_columns(adata, genes):
+ # Test that common columns are dropped
+ adata.var["gene_biotype"] = "foo"
+ adata.var["seq_name"] = "chr1"
+ with pytest.warns(UserWarning, match=r"gene_biotype.+seq_name"):
+ annotated_var = gf.annotate(adata.var, genes)
+ assert "lncRNA" in annotated_var["gene_biotype"].values
+ assert "12" in annotated_var["seq_name"].values