diff --git a/README.md b/README.md index 206fed3..ae97d26 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,18 @@ # STARRPeaker Uniform processing pipeline and peak caller for STARR-seq data +## Changelog +### v1.1 +- Updated the final peak call (BED6+) ENCODE specification (https://www.encodeproject.org/documents/9f5d2b5a-bd29-4983-9c01-fab4ab8b5ea2/) +- In specific, fold change is changed to log2 fold change +- In specific, input coverage is reported along with output coverage + +### v1.0 +- Updated documentation for ENCODE release (https://bit.ly/whg-starr-seq) + +### v1.0-rc +- Release candidate with early version of documentation + ## Dependencies (version tested) * Python 2.7 (v2.7.15) * pysam (v0.15.3) @@ -120,7 +132,22 @@ starrpeaker --prefix --chromsize -- * *prefix*.peak.pval.bw: P-value track in bigWig format (-log10) * *prefix*.peak.qval.bw: Q-value track in bigWig format (-log10) -## Final Peak Call Format (BED6+4) +## Final Peak Call Format (v1.1 and above; BED6+5) +* Column 1: Chromosome +* Column 2: Start position +* Column 3: End position +* Column 4: Name (peak rank based on score, 1 being the highest rank) +* Column 5: Score (integer value of "100 * fold change", maxed at 1000 per BED format specification) +* Column 6: Strand +* Column 7: Log2 Fold change (normalized output/input ratio, in log2 space) +* Column 8: Input fragment coverage (total fragments across/within replicate(s)) +* Column 9: Output fragment coverage (total fragments across/within replicate(s)) +* Column 10: -log10 of P-value +* Column 11: -log10 of Q-value (Benjamini-Hochberg False Discovery Rate, FDR) + +*ENCODE MPRA/STARR-seq BED6+5 common file format: https://www.encodeproject.org/documents/9f5d2b5a-bd29-4983-9c01-fab4ab8b5ea2/* + +## Final Peak Call Format (up to v1.0; BED6+4) * Column 1: Chromosome * Column 2: Start position * Column 3: End position diff --git a/starrpeaker/core.py b/starrpeaker/core.py index 8e64953..55a42bf 100644 --- a/starrpeaker/core.py +++ b/starrpeaker/core.py @@ -745,19 +745,23 @@ def call_peak(prefix, bedFile, bctFile, chromSize, bwFile, covFile=None, thresho ### merge peak print("[%s] Merge peaks" % (timestamp())) - pybedtools.BedTool(prefix + ".peak.centered.bed").merge(c=[4, 6, 7, 8], o=["max", "max", "max", "max"]).saveas(prefix + ".peak.centered.merged.bed") + # pybedtools.BedTool(prefix + ".peak.centered.bed").merge(c=[4, 6, 7, 8], o=["max", "max", "max", "max"]).saveas(prefix + ".peak.centered.merged.bed") + pybedtools.BedTool(prefix + ".peak.centered.bed").merge(c=[4, 5, 6, 7, 8], o=["max", "max", "max", "max", "max"]).saveas(prefix + ".peak.centered.merged.bed") ### finalize peak print("[%s] Finalize peaks" % (timestamp())) peak = pd.read_csv(prefix + ".peak.centered.merged.bed", sep='\t', header=None) - peak.columns = ['chr', 'start', 'end', 'fc', 'cov', 'nlog10pval', 'nlog10qval'] + # peak.columns = ['chr', 'start', 'end', 'fc', 'cov', 'nlog10pval', 'nlog10qval'] + peak.columns = ['chr', 'start', 'end', 'fc', 'icov', 'ocov', 'nlog10pval', 'nlog10qval'] + peak['log2fc'] = np.log2(peak['fc']) peak['idx'] = peak.index peak['strand'] = "." peak['score'] = [min(int(round(x)), 1000) for x in peak['fc'] * 100] peak = peak.sort_values(by=['fc'], ascending=False) peak['name'] = ['peak_' + str(x) for x in peak.reset_index().index + 1] peak = peak.sort_values(by=['idx']) - final = peak[['chr', 'start', 'end', 'name', 'score', 'strand', 'fc', 'cov', 'nlog10pval', 'nlog10qval']] + # final = peak[['chr', 'start', 'end', 'name', 'score', 'strand', 'fc', 'cov', 'nlog10pval', 'nlog10qval']] + final = peak[['chr', 'start', 'end', 'name', 'score', 'strand', 'log2fc', 'icov', 'ocov', 'nlog10pval', 'nlog10qval']] final.to_csv(prefix + '.peak.final.bed', sep='\t', float_format='%.3f', index=False, header=False) ### remove intermediate peak files