-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_seq_columns.py
44 lines (34 loc) · 1.22 KB
/
create_seq_columns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import pandas as pd
# Functions used to create sequence columns
def sequence_bounds(summit: int, start: int, end: int, length: int):
    """
    Calculate the sequence coordinates (bounds) for a given DHS.

    The window is centered on ``summit`` when there is room on both
    sides; otherwise it is anchored at ``start`` or at ``end``. Every
    branch returns a window exactly ``length`` wide.

    Parameters
    ----------
    summit : int
        Summit coordinate of the DHS.
    start : int
        Start coordinate of the DHS.
    end : int
        End coordinate of the DHS.
    length : int
        Width of the window to return.

    Returns
    -------
    tuple of (int, int)
        ``(left, right)`` with ``right - left == length``.

    https://github.com/meuleman/SynthSeqs/blob/main/make_data/process.py
    """
    left_half = length // 2
    # For odd lengths the extra base goes to the right of the summit, so the
    # window is always exactly `length` wide (the plain `summit + half` form
    # silently dropped one base when `length` was odd).
    right_half = length - left_half

    if (summit - start) < left_half:
        # Not enough room on the left: anchor the window at `start`.
        return start, start + length
    if (end - summit) < right_half:
        # Not enough room on the right: anchor the window at `end`.
        return end - length, end

    left = summit - left_half
    return left, left + length
def add_sequence_column(df: pd.DataFrame, genome, length: int) -> pd.DataFrame:
    """
    Query the reference genome for each DHS and add the raw sequences
    to the dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe of DHS annotations and NMF loadings. The columns
        'seqname', 'summit', 'start' and 'end' are read from it.
    genome : ReferenceGenome(DataSource)
        A reference genome object to query for sequences. It is called as
        ``genome.sequence(seqname, left, right)``.
    length : int
        Length of a DHS.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, modified in place with a new 'sequence'
        column holding one queried sequence per row.

    https://github.com/meuleman/SynthSeqs/blob/main/make_data/process.py
    """
    # Walk the needed columns in lockstep instead of using iterrows():
    # iterrows() builds a Series per row (slow) and may coerce integer
    # coordinates to float when the row mixes numeric dtypes.
    coordinates = zip(df['seqname'], df['summit'], df['start'], df['end'])
    df['sequence'] = [
        genome.sequence(seqname, *sequence_bounds(summit, start, end, length))
        for seqname, summit, start, end in coordinates
    ]
    return df