-
Notifications
You must be signed in to change notification settings - Fork 0
/
main_findRelateds.sh
263 lines (202 loc) · 14.3 KB
/
main_findRelateds.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#!/bin/bash
## Pedigree analysis
mamba install -c conda-forge ghostscript
mamba install -c conda-forge r-igraph
mamba install -c conda-forge r-kinship2
# Download KING executable in your path.
wget https://www.kingrelatedness.com/Linux-king.tar.gz
tar -xzvf Linux-king.tar.gz
# Define your input file (in PLINK binary format: .bed, .bim, .fam)
INPUT_FILE="AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids" ## end of '3.Prep Array Sets'
# prepare more information from dog profiles
AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.fam
awk 'BEGIN{FS=OFS="\t"}NR==FNR{a[$3]=$4;next}{if(a[$1])print a[$1],$0}' tmp_map/map_id.tab <(tail -n+2 phenotypes/dog_profile.csv | tr ',' '\t') > dog_profile_grls.tab
awk 'BEGIN{FS=OFS="\t"}NR==FNR{a[$1]=1;next}{if(!a[$3])print}' <(tail -n+2 phenotypes/dog_profile.csv | tr ',' '\t') tmp_map/map_id.tab > dog_profile_grls.missing.tab ## all missing are oldies
# Prepare the Plink input files: change chr format & make each sample in a seperate family
plink --bfile ${INPUT_FILE} --chr-set 38 no-xy --allow-extra-chr '0' \
--geno 0.05 --maf 0.01 \
--make-bed --out ${INPUT_FILE}.king
cp ${INPUT_FILE}.king.fam ${INPUT_FILE}.king.fam_temp
awk '{$1=$2;print}' ${INPUT_FILE}.king.fam_temp > ${INPUT_FILE}.king.fam
# Step 1: Run KING to estimate kinship coefficients.
# -b specifies the input PLINK binary file.
#./king -b ${INPUT_FILE}.king.bed --kinship --prefix ${INPUT_FILE}.kingOutput &> king.kinship.log
# Step 2: Predict the pedigree based on kinship estimates.
# This will produce a .ped file with predicted relationships.
./king -b ${INPUT_FILE}.king.bed --related --degree 2 --sexchr 39 --prefix ${INPUT_FILE}.kingRelatedOutput --rplot &> king.related.log
./king -b ${INPUT_FILE}.king.bed --cluster --degree 2 --sexchr 39 --prefix ${INPUT_FILE}.kingClusterOutput --rplot &> king.cluster.log
#./king -b ${INPUT_FILE}.king.bed --build --degree 2 --sexchr 39 --prefix ${INPUT_FILE}.kingBuildOutput --rplot &> king.build.log
#./king -b ${INPUT_FILE}.king.bed --build --degree 2 --sexchr 39 --prefix ${INPUT_FILE}.kingBuildOutput &> king.build.log ## --degree 2 causes core dump
./king -b ${INPUT_FILE}.king.bed --build --sexchr 39 --prefix ${INPUT_FILE}.kingBuildOutput &> king.build.log
## merge outputs from related and cluster
tail -n+2 AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingRelatedOutput.kin0 | cut -f2,4,14 | \
while read id1 id2 rel;do
printf "Related $id1 $id2 $rel " | tr ' ' '\t';
echo "cluster" $(grep -w "$id1" AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingClusterOutputcluster.kin | grep -w "$id2" | cut -f1-3,15) | tr ' ' '\t';done > Related_cluster.tab
cat Related_cluster.tab | awk '{if($9 && $4!=$9)print}' ## perfect match
cat Related_cluster.tab | awk 'BEGIN{FS=OFS="\t"}{if(!$6) $6="-";print $2,$3,$4,$6}' > Related_cluster_short.tab
':
## Python
import pandas as pd
import networkx as nx
import sys
def read_file(file_path):
"""
Reads the tab-separated file and returns a list of edges
using only the first two columns.
"""
data = pd.read_csv(file_path, sep='\t', header=None, usecols=[0, 1])
edges = list(data.itertuples(index=False, name=None))
return edges
def find_connected_components(edges):
"""
Finds and returns the connected components given a list of edges.
"""
G = nx.Graph()
G.add_edges_from(edges)
connected_components = list(nx.connected_components(G))
return connected_components
def main(file_path):
edges = read_file(file_path)
components = find_connected_components(edges)
for i, component in enumerate(components, start=1):
print(f"Component {i}: {component}")
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python script.py <path_to_file>")
else:
file_path = sys.argv[1]
main(file_path)
'
## Save this script as 'connected_components.py'
python connected_components.py Related_cluster_short.tab > Related_cluster_all_conn.txt
cat Related_cluster_short.tab | awk 'BEGIN{FS=OFS="\t"}{if($3!="2nd")print $0}' > Related_cluster_closeRel_short.tab
python connected_components.py Related_cluster_closeRel_short.tab > Related_cluster_closeRel_conn.txt
rclone -v --copy-links copy Related_cluster_short.tab remote_UCDavis_GoogleDr:MAF/forSequencing/related/
rclone -v --copy-links copy Related_cluster_all_conn.txt remote_UCDavis_GoogleDr:MAF/forSequencing/related/
rclone -v --copy-links copy Related_cluster_closeRel_conn.txt remote_UCDavis_GoogleDr:MAF/forSequencing/related/
## The build output
# 1. errors
cat AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingBuildOutputupdateparents.txt | awk '{if($2==$3)print}' ## 6
cat AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingBuildOutputupdateparents.txt | awk '{if($2==$4)print}' ## 11
# 2. confirm FS && PO relationship && fix the issue of the match between a parent and sample ID
cat AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingBuildOutputupdateids.txt | sed 's/KING/FAM/' > AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingBuildOutputupdateids_FAM.txt
cat AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingBuildOutputupdateparents.txt | sed 's/KING/FAM/' | awk 'BEGIN{FS=OFS="\t"}{if($3==$2)$3=0;if($4==$2)$4=0;print $0}' > AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingBuildOutputupdateparents_FAM.txt
awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$2]=$0;next}{if(a[$2])print a[$2];else print "-","-","-","-";if(a[$4])print a[$4];else print "-","-","-","-";}' AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingBuildOutputupdateparents_FAM.txt <(tail -n+2 AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingRelatedOutput.kin0) | paste - - > Related_families.tab
paste Related_cluster_short.tab Related_families.tab > Related_cluster_families.tab
# 2a) PO relationship
cat AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingBuildOutputupdateparents_FAM.txt | awk 'BEGIN{FS=OFS="\t"}{if($2!=$3 && $3 !~ /^[0-9]+$/)print $2,$3;}' > build.po1
cat AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingBuildOutputupdateparents_FAM.txt | awk 'BEGIN{FS=OFS="\t"}{if($2!=$4 && $4 !~ /^[0-9]+$/)print $2,$4;}' > build.po2
cat build.po1 build.po2 | sort | uniq > build.po
grep -w PO Related_cluster_short.tab | while read id1 id2 moreData;do grep -w $id1 build.po | grep -w $id2;done > build.po.confirmed_temp
awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1"."$2]=1;next}{if(a[$2"."$1])print}' build.po.confirmed_temp build.po.confirmed_temp ## 2 pairs of bidirectional PO
cat Related_cluster_families.tab | awk 'BEGIN{FS=OFS="\t"}{if($3=="PO" && ($2==$7 || $2==$8) && ($1!=$11 && $1!=$12))print}' > Related_cluster_families.po.confirmed.tab
cat Related_cluster_families.tab | awk 'BEGIN{FS=OFS="\t"}{if($3=="PO" && ($2!=$7 && $2!=$8) && ($1==$11 || $1==$12))print $2,$1,$3,$4,$9,$10,$11,$12,$5,$6,$7,$8}' >> Related_cluster_families.po.confirmed.tab
awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1"."$2]=1;next}{if(a[$1"."$2])print;}' build.po.confirmed_temp Related_cluster_families.po.confirmed.tab > build.po.confirmed ## 79
awk 'BEGIN{FS=OFS="\t"}NR==FNR{a[$1]=$4;next}{if(!a[$1])a[$1]="-";if(!a[$2])a[$2]="-";print $0,a[$1],a[$2]}' dog_profile_grls.tab build.po.confirmed > build.po.confirmed_withProfile
awk 'BEGIN{FS=OFS="\t"}NR==FNR{a[$4]=$1;next}{if($13=="-")$13=a[$1];if($14=="-")$14=a[$2];print $0}' tmp_map/map_id.tab build.po.confirmed_withProfile > build.po.confirmed_withProfile2
cat build.po.confirmed_withProfile2 | awk '{if($13=="Oldies")print $1;if($14=="Oldies")print $2;}' | wc -l ## 46
cat build.po.confirmed_withProfile2 | awk '{if($13=="Oldies")print $1;if($14=="Oldies")print $2;}' | sort | uniq | wc -l ## 19
# Function to calculate difference in months between two dates
calculate_month_diff() {
local start_date=$1
local end_date=$2
# Extract year and month from the dates
local start_year=$(echo $start_date | cut -d'-' -f1)
local start_month=$(echo $start_date | cut -d'-' -f2)
local end_year=$(echo $end_date | cut -d'-' -f1)
local end_month=$(echo $end_date | cut -d'-' -f2)
# Force Bash to interpret numbers as decimal
start_year=$((10#$start_year))
start_month=$((10#$start_month))
end_year=$((10#$end_year))
end_month=$((10#$end_month))
# Calculate difference in months
local month_diff=$(( (end_year - start_year) * 12 + end_month - start_month ))
echo $month_diff
}
# Read from the file line by line
while read end_date start_date; do
# Calculate and print the month difference
month_diff=$(calculate_month_diff $start_date $end_date)
echo $month_diff
done < <(cat build.po.confirmed_withProfile2 | sed 's/Oldies/2005-01/' | cut -f13,14) > time_diff
paste build.po.confirmed_withProfile2 time_diff | awk '{if($15>0)print $1,$2,$13,$14}' > build.po.confirmed_final
paste build.po.confirmed_withProfile2 time_diff | awk '{if($15<0)print}' > build.po.confirmed_negative
rclone -v --copy-links copy build.po.confirmed_final remote_UCDavis_GoogleDr:MAF/Pedigree/
# 2b) FS relationship (check if they have the same parents ==> FS
cat Related_cluster_families.tab | awk 'BEGIN{FS=OFS="\t"}{if($3=="FS" && $7!="-" && $8!="-" && (($1==$11 || $1==$12) || ($2==$7 || $2==$8)))print}' > build.fs_po.toBeAssessed ## 117 ## Detected as FS but was also reported as PO
rclone -v --copy-links copy build.fs_po.toBeAssessed remote_UCDavis_GoogleDr:MAF/Pedigree/
cat Related_cluster_families.tab | awk 'BEGIN{FS=OFS="\t"}{if($3=="FS" && $7!="-" && $8!="-" && $7==$11 && $8==$12)print}' > build.fs.confirmed ## 4452
cat build.fs.confirmed | cut -f5-8 > build.fs.confirmed_ind && cat build.fs.confirmed | cut -f9-12 >> build.fs.confirmed_ind
cat build.fs.confirmed_ind | sort | uniq | wc -l ## 1757
cat build.fs.confirmed | sort -k7,8 > build.fs.confirmed.sorted
cut -f7,8 build.fs.confirmed.sorted | uniq | wc -l ## 425
mkdir families_ped
cut -f5,7,8 build.fs.confirmed.sorted | uniq | while read fam id1 id2;do
cat build.fs.confirmed.sorted | awk -v id1="$id1" -v id2="$id2" 'BEGIN{FS=OFS="\t"}{if($7==id1 && $8==id2)print}' | cut -f1-5,7,8 > families_ped/$fam.$id1.$id2.ped
#cat build.fs.confirmed | grep -w "$id1" | grep -w "$id2" | cut -f1-5,7,8 > families_ped/$fam.$id1.$id2.ped
done
for f in families_ped/*.ped;do
echo $f
cat $f | awk 'BEGIN{FS=OFS="\t"}{a[$1]+=1;a[$2]+=1;}END{for (i in a)print i,a[i]}'
done > families.consistancy
awk 'BEGIN{FS=OFS="\t"}NR==FNR{a[$1]=$0;next}{if(a[$1])print $2,a[$1];else print $1}' dog_profile_grls.tab families.consistancy > families.consistancy_withProfile
awk 'BEGIN{FS=OFS="\t"}NR==FNR{a[$1]=$0;next}/^families_ped/{print;next}{if(a[$1])print $2,a[$1];else print $2,$1}' dog_profile_grls.tab families.consistancy > families.consistancy_withProfile
# 4. dogs identified as FS by --related/--cluster commands are predicted to have totally different parents by the '--build' command
cat Related_cluster_families.tab | awk 'BEGIN{FS=OFS="\t"}{if($3=="FS" && $7!="-" && $11!="-" && $7!=$11 && $8!=$12)print}'
tail -n+2 AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingRelatedOutput.kin0 | awk 'BEGIN{FS=OFS="\t"}{if($14=="PO"){print $1,$2;print $3,$4;}}' | sort | uniq > Related_PO.lst
#tail -n+2 AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.kingRelatedOutput.kin0 | awk 'BEGIN{FS=OFS="\t"}{if($14!="2nd"){print $1,$2;print $3,$4;}}' | sort | uniq > Related_1st.lst
# Prepare the Plink input files: change chr format & make each sample in a seperate family
plink --bfile ${INPUT_FILE}.king --chr-set 38 no-xy --allow-extra-chr '0' \
--keep Related_PO.lst --maf 0.01 \
--recode --out ${INPUT_FILE}.PO.toSequoia
awk 'BEGIN{FS=OFS="\t"}NR==FNR{a[$3]=$4;next}{if(a[$1])print a[$1],$3}' tmp_map/map_id.tab <(tail -n+2 phenotypes/dog_profile.csv | tr ',' '\t') | cut -d"-" -f1 > sequoia.birthYear
cat ${INPUT_FILE}.king.fam | awk 'BEGIN{OFS="\t"}{if($5==1)print $2,2;else if($5==2)print $2,1;}' > sequoia.gender
echo ID Sex BirthYear | tr ' ' '\t' > sequoia.LHD
awk 'BEGIN{FS=OFS="\t"}NR==FNR{a[$1]=$0;next}{if(a[$1])print a[$1],$2}' sequoia.gender sequoia.birthYear >> sequoia.LHD
> R
install.packages("sequoia")
library(sequoia)
input="AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.PO.toSequoia.ped"
GenoM_name="AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.po.sequoia"
GenoConvert(InFile = input, InFormat = "ped", OutFile = GenoM_name, OutFormat = "seq")
GenoM <- read.table(GenoM_name, sep = " ", header = FALSE, row.names = 1)
GenoM <- as.matrix(GenoM)
lhd=read.table("sequoia.LHD", header=TRUE)
SeqOUT <- sequoia(GenoM = GenoM,LifeHistData = lhd,Err = 0.01)
# One PO confimred in a cluster
grep -w KING47 Related_cluster_short.tab | awk 'BEGIN{OFS="\t"}{print $1,$1;print $2,$2;}' > Related_KING47.lst
plink --bfile ${INPUT_FILE}.king --chr-set 38 no-xy --allow-extra-chr '0' \
--keep Related_KING47.lst --maf 0.05 --snps-only \
--recode --out ${INPUT_FILE}.PO_KING47.toSequoia
> R
install.packages("sequoia")
library(sequoia)
input="AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.PO_KING47.toSequoia.ped"
GenoM_name="AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.PO_KING47.sequoia"
GenoConvert(InFile = input, InFormat = "ped", OutFile = GenoM_name, OutFormat = "seq")
GenoM <- read.table(GenoM_name, sep = " ", header = FALSE, row.names = 1)
GenoM <- as.matrix(GenoM)
SeqOUT <- sequoia(GenoM = GenoM,Err = 0.01)
#
cat Related_cluster_families.tab | awk 'BEGIN{FS=OFS="\t"}{if($3=="FS" && $7!="-" && $7 !~ /^[0-9]+$/ && $8!="-" && $8 !~ /^[0-9]+$/ && $7==$11 && $8==$12)print}' | grep FAM24 | awk 'BEGIN{OFS="\t"}{print $1,$1;print $2,$2;print $7,$7;print $8,$8;}' | sort | uniq > Related_FAM24.lst
plink --bfile ${INPUT_FILE}.king --chr-set 38 no-xy --allow-extra-chr '0' \
--keep Related_FAM24.lst --geno 0 --maf 0.05 --snps-only \
--recode --out ${INPUT_FILE}.FS_FAM24.toSequoia
> R
install.packages("sequoia")
library(sequoia)
input="AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.FS_FAM24.toSequoia.ped"
GenoM_name="AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.FS_FAM24.sequoia"
GenoConvert(InFile = input, InFormat = "ped", OutFile = GenoM_name, OutFormat = "seq")
GenoM <- read.table(GenoM_name, sep = " ", header = FALSE, row.names = 1)
GenoM <- as.matrix(GenoM)
SeqOUT <- sequoia(GenoM = GenoM,Err = 0.005)
####
cat AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.PO.toSequoia.ped | cut -d" " -f1-100 > temp.ped ## ie. 47 markers
head -n194 AxiomGT1v2.comp_merge.deDup.sexConfirm.public_ids.PO.toSequoia.map > temp.map
> R
library(sequoia)
input="temp.ped"
GenoConvert(InFile = input, InFormat = "ped", OutFile = "temp.sequoia", OutFormat = "seq")