Merge branch 'master' of git://github.com/hoondy/starrpeaker

gersteinlab · Aug 12, 2021 · e74275f · e74275f
2 parents 754392d + 0efa023
commit e74275f
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,18 @@
 # STARRPeaker
 Uniform processing pipeline and peak caller for STARR-seq data
 
+## Changelog
+### v1.1
+- Updated the final peak call (BED6+) ENCODE specification (https://www.encodeproject.org/documents/9f5d2b5a-bd29-4983-9c01-fab4ab8b5ea2/)
+- In specific, fold change is changed to log2 fold change
+- In specific, input coverage is reported along with output coverage
+
+### v1.0
+- Updated documentation for ENCODE release (https://bit.ly/whg-starr-seq)
+
+### v1.0-rc
+- Release candidate with early version of documentation
+
 ## Dependencies (version tested)
 * Python 2.7 (v2.7.15)
 * pysam (v0.15.3)
@@ -120,7 +132,22 @@ starrpeaker --prefix <prefix for output files> --chromsize <hg38.chrom.sizes> --
 * *prefix*.peak.pval.bw: P-value track in bigWig format (-log10)
 * *prefix*.peak.qval.bw: Q-value track in bigWig format (-log10)
 
-## Final Peak Call Format (BED6+4)
+## Final Peak Call Format (v1.1 and above; BED6+5)
+* Column 1: Chromosome
+* Column 2: Start position
+* Column 3: End position
+* Column 4: Name (peak rank based on score, 1 being the highest rank)
+* Column 5: Score (integer value of "100 * fold change", maxed at 1000 per BED format specification)
+* Column 6: Strand
+* Column 7: Log2 Fold change (normalized output/input ratio, in log2 space)
+* Column 8: Input fragment coverage (total fragments across/within replicate(s))
+* Column 9: Output fragment coverage (total fragments across/within replicate(s))
+* Column 10: -log10 of P-value
+* Column 11: -log10 of Q-value (Benjamini-Hochberg False Discovery Rate, FDR)
+
+*ENCODE MPRA/STARR-seq BED6+5 common file format: https://www.encodeproject.org/documents/9f5d2b5a-bd29-4983-9c01-fab4ab8b5ea2/*
+
+## Final Peak Call Format (up to v1.0; BED6+4)
 * Column 1: Chromosome
 * Column 2: Start position
 * Column 3: End position

diff --git a/starrpeaker/core.py b/starrpeaker/core.py
@@ -745,19 +745,23 @@ def call_peak(prefix, bedFile, bctFile, chromSize, bwFile, covFile=None, thresho
 
     ### merge peak
     print("[%s] Merge peaks" % (timestamp()))
-    pybedtools.BedTool(prefix + ".peak.centered.bed").merge(c=[4, 6, 7, 8], o=["max", "max", "max", "max"]).saveas(prefix + ".peak.centered.merged.bed")
+    # pybedtools.BedTool(prefix + ".peak.centered.bed").merge(c=[4, 6, 7, 8], o=["max", "max", "max", "max"]).saveas(prefix + ".peak.centered.merged.bed")
+    pybedtools.BedTool(prefix + ".peak.centered.bed").merge(c=[4, 5, 6, 7, 8], o=["max", "max", "max", "max", "max"]).saveas(prefix + ".peak.centered.merged.bed")
 
     ### finalize peak
     print("[%s] Finalize peaks" % (timestamp()))
     peak = pd.read_csv(prefix + ".peak.centered.merged.bed", sep='\t', header=None)
-    peak.columns = ['chr', 'start', 'end', 'fc', 'cov', 'nlog10pval', 'nlog10qval']
+    # peak.columns = ['chr', 'start', 'end', 'fc', 'cov', 'nlog10pval', 'nlog10qval']
+    peak.columns = ['chr', 'start', 'end', 'fc', 'icov', 'ocov', 'nlog10pval', 'nlog10qval']
+    peak['log2fc'] = np.log2(peak['fc'])
     peak['idx'] = peak.index
     peak['strand'] = "."
     peak['score'] = [min(int(round(x)), 1000) for x in peak['fc'] * 100]
     peak = peak.sort_values(by=['fc'], ascending=False)
     peak['name'] = ['peak_' + str(x) for x in peak.reset_index().index + 1]
     peak = peak.sort_values(by=['idx'])
-    final = peak[['chr', 'start', 'end', 'name', 'score', 'strand', 'fc', 'cov', 'nlog10pval', 'nlog10qval']]
+    # final = peak[['chr', 'start', 'end', 'name', 'score', 'strand', 'fc', 'cov', 'nlog10pval', 'nlog10qval']]
+    final = peak[['chr', 'start', 'end', 'name', 'score', 'strand', 'log2fc', 'icov', 'ocov', 'nlog10pval', 'nlog10qval']]
     final.to_csv(prefix + '.peak.final.bed', sep='\t', float_format='%.3f', index=False, header=False)
 
     ### remove intermediate peak files