Coverage for lib/optwps.py: 10%
125 statements
« prev ^ index » next coverage.py v7.11.1, created at 2025-11-09 22:36 +0100
« prev ^ index » next coverage.py v7.11.1, created at 2025-11-09 22:36 +0100
1#!/usr/bin/env python
2"""
3Fast Window Protection Score (WPS) Calculator for Cell-Free DNA Analysis.
5This module provides efficient computation of Window Protection Scores from BAM files,
6designed for analyzing cell-free DNA fragmentation patterns and nucleosome positioning.
8The WPS algorithm identifies protected genomic regions by analyzing DNA fragment
9coverage patterns. It counts fragments that span versus fragments whose endpoints
10fall within a protection window around each genomic position.
12Example:
13 Basic usage::
15 from lib.optwps import WPS
17 wps_calc = WPS(protection_size=120)
18 wps_calc.run(bamfile='input.bam', out_filepath='output.tsv')
20Note:
21 - Op BAM Description
22 - M 0 alignment match (can be a sequence match or mismatch)
23 - I 1 insertion to the reference
24 - D 2 deletion from the reference
25 - N 3 skipped region from the reference
26 - S 4 soft clipping (clipped sequences present in SEQ)
27 - H 5 hard clipping (clipped sequences NOT present in SEQ)
28 - P 6 padding (silent deletion from padded reference)
29 - = 7 sequence match
30 - X 8 sequence mismatch
31 - pysam always uses 0-based coordinates, converting from 1-based of the sam files!
32"""
34import os
35import pysam
36import random
37import numpy as np
39from lib.utils import exopen, is_soft_clipped, ref_aln_length
40from tqdm.auto import tqdm
43class ROIGenerator:
44 """
45 Generate regions of interest (ROI) for processing.
47 This class generates genomic regions either from a BED file or from the entire
48 genome in the BAM file. Regions are yielded in chunks for memory-efficient processing.
50 Args:
51 bed_file (str, optional): Path to BED file containing regions to process.
52 If None, the entire genome will be processed. Default: None
53 chunk_size (int, optional): Size of chunks in base pairs for processing.
54 Default: 1e8 (100 megabases)
55 """
57 def __init__(self, bed_file=None, chunk_size=1e8):
58 self.bed_file = bed_file
59 self.chunk_size = chunk_size
61 def regions(self, bam_file=None):
62 """
63 Generate regions for processing.
65 Yields genomic regions either from a BED file or from the entire genome
66 referenced in the BAM file. Regions are chunked according to chunk_size.
68 Args:
69 bam_file (str, optional): Path to BAM file. Required if no BED file
70 is provided to determine genome regions. Default: None
72 Yields:
73 tuple: (chromosome, start, end) for each region chunk
75 Raises:
76 ValueError: If neither bed_file nor bam_file can provide regions
77 """
78 if (self.bed_file is None) or (not os.path.exists(self.bed_file)):
79 input_file = pysam.Samfile(bam_file, "rb")
80 nchunks = sum(
81 (input_file.get_reference_length(chrom) - 1) // self.chunk_size + 1
82 for chrom in input_file.references
83 )
84 iterator = tqdm(total=nchunks, desc="Processing genome regions")
85 for chrom in input_file.references:
86 chrom_length = input_file.get_reference_length(chrom)
87 region_start = 0
88 while region_start < chrom_length:
89 region_end = min(region_start + self.chunk_size, chrom_length)
90 yield chrom, region_start, region_end
91 region_start = region_end
92 iterator.update(1)
93 iterator.close()
94 else:
95 # read number of lines in bed file
96 nlines = sum(1 for _ in exopen(self.bed_file, "r"))
97 with exopen(self.bed_file, "r") as bed:
98 for line in tqdm(bed, total=nlines, desc="Processing BED regions"):
99 chrom, start, end = line.split()[:3]
100 chunk_start = int(start)
101 chunk_end = min(chunk_start + self.chunk_size, int(end))
102 while chunk_start < int(end):
103 yield chrom, chunk_start, chunk_end
104 chunk_start = chunk_end
105 chunk_end = min(chunk_start + self.chunk_size, int(end))
108class WPS:
109 """
110 Window Protection Score (WPS) calculator for cell-free DNA analysis.
112 This class computes Window Protection Scores from aligned sequencing reads in BAM format.
113 WPS quantifies the protection of DNA fragments around each genomic position, useful for
114 identifying nucleosome positioning and other protected regions in cell-free DNA.
116 The algorithm calculates:
117 - Outside score: fragments that completely span the protection window
118 - Inside score: fragment endpoints falling within the protection window
119 - WPS = outside - inside
121 Args:
122 bed_file (str, optional): Path to BED file with regions to process.
123 If None, processes entire genome. Default: None
124 protection_size (int, optional): Total protection window size in base pairs.
125 This value is divided by 2 to get the window on each side of the position.
126 Default: 120
127 min_insert_size (int, optional): Minimum insert/fragment size to include.
128 If None, no minimum filter applied. Default: None
129 max_insert_size (int, optional): Maximum insert/fragment size to include.
130 If None, no maximum filter applied. Default: None
131 valid_chroms (set, optional): Set of valid chromosome names to process.
132 Default: chromosomes 1-22, X, Y
133 chunk_size (float, optional): Region chunk size for processing.
134 Default: 1e8 (100 Mb)
136 Attributes:
137 bed_file (str): Path to BED file or None
138 protection_size (int): Half of the protection window size
139 valid_chroms (set): Set of valid chromosome names
140 min_insert_size (int): Minimum fragment size filter
141 max_insert_size (int): Maximum fragment size filter
142 chunk_size (float): Chunk size for processing
143 roi_generator (ROIGenerator): Region generator instance
145 Example:
146 >>> wps = WPS(protection_size=120, min_insert_size=120, max_insert_size=180)
147 >>> wps.run(bamfile='sample.bam', out_filepath='wps_output.tsv')
149 Note:
150 - Automatically filters duplicate, QC-failed, and unmapped reads
151 - Handles both paired-end and single-end sequencing data
152 - Supports downsampling for high-coverage samples
153 """
155 def __init__(
156 self,
157 bed_file=None,
158 protection_size=120,
159 min_insert_size=None,
160 max_insert_size=None,
161 valid_chroms=set(map(str, list(range(1, 23)) + ["X", "Y"])),
162 chunk_size=1e8,
163 ):
164 self.bed_file = bed_file
165 self.protection_size = protection_size // 2
166 if valid_chroms is not None:
167 self.valid_chroms = [x.replace("chr", "") for x in valid_chroms]
168 else:
169 self.valid_chroms = None
170 self.min_insert_size = min_insert_size
171 self.max_insert_size = max_insert_size
172 self.chunk_size = chunk_size
173 self.roi_generator = ROIGenerator(bed_file=self.bed_file)
175 def __call__(self, *args, **kwargs):
176 return self.run(*args, **kwargs)
178 def run(
179 self,
180 bamfile,
181 out_filepath=None,
182 downsample_ratio=None,
183 verbose_output=False,
184 ):
185 """
186 Calculate Window Protection Score for all regions and write to file.
188 Processes the BAM file to compute WPS values for each genomic position
189 in the specified regions (or entire genome). Results are written to a
190 tab-separated output file.
192 Args:
193 bamfile (str): Path to input BAM file (must be sorted and indexed)
194 out_filepath (str): Path to output TSV file, or stdout (default)
195 downsample_ratio (float, optional): Fraction of reads to randomly keep
196 (0.0 to 1.0). Useful for high-coverage samples. Default: None (no downsampling)
198 Returns:
199 None: Results are written directly to out_filepath
201 Output Format:
202 Tab-separated file with columns:
203 1. chromosome: Chromosome name (without 'chr' prefix)
204 2. start: Start position (0-based)
205 3. end: End position (1-based, start + 1)
206 4. outside: Count of fragments spanning the protection window (if verbose_output)
207 5. inside: Count of fragment endpoints in protection window (if verbose_output)
208 6. wps: Window Protection Score (outside - inside)
210 Raises:
211 FileNotFoundError: If bamfile does not exist
212 ValueError: If downsample_ratio is not between 0 and 1
214 Example:
215 >>> wps = WPS()
216 >>> wps.run(bamfile='input.bam', out_filepath='output.tsv')
217 >>> # With downsampling
218 >>> wps.run(bamfile='input.bam', out_filepath='output.tsv',
219 ... downsample_ratio=0.5)
220 """
221 if out_filepath is None:
222 out_filepath = "stdout"
223 input_file = pysam.Samfile(bamfile, "rb")
224 prefix = (
225 "chr" if any(r.startswith("chr") for r in input_file.references) else ""
226 )
227 with exopen(out_filepath, "w") as outfile:
229 for chrom, start, end in self.roi_generator.regions(bam_file=bamfile):
230 if "chr" in chrom:
231 chrom = chrom.replace("chr", "")
232 if self.valid_chroms is not None and chrom not in self.valid_chroms:
233 continue
234 try:
235 regionStart, regionEnd = int(start), int(end)
236 except ValueError:
237 continue
239 starts = []
240 ends = []
241 for read in input_file.fetch(
242 prefix + chrom,
243 max(0, regionStart - self.protection_size - 1),
244 regionEnd + self.protection_size + 1,
245 ):
246 if read.is_duplicate or read.is_qcfail or read.is_unmapped:
247 continue
248 if is_soft_clipped(read.cigartuples):
249 continue
251 if read.is_paired:
252 if read.mate_is_unmapped:
253 continue
254 if read.rnext != read.tid:
255 continue
256 if read.is_read1 or (
257 read.is_read2
258 and read.pnext + read.qlen
259 < regionStart - self.protection_size - 1
260 ):
261 if read.isize == 0:
262 continue
263 if (
264 downsample_ratio is not None
265 and random.random() >= downsample_ratio
266 ):
267 continue
268 lseq = abs(read.isize)
269 if (
270 self.min_insert_size is not None
271 and lseq < self.min_insert_size
272 ):
273 continue
274 if (
275 self.max_insert_size is not None
276 and lseq > self.max_insert_size
277 ):
278 continue
279 rstart = min(read.pos, read.pnext)
280 rend = rstart + lseq - 1
281 starts.append(rstart)
282 ends.append(rend)
283 else:
284 if (
285 downsample_ratio is not None
286 and random.random() >= downsample_ratio
287 ):
288 continue
289 rstart = read.pos
290 lseq = ref_aln_length(read.cigartuples)
291 if self.min_insert_size is not None and (
292 (lseq < self.min_insert_size)
293 or (lseq > self.max_insert_size)
294 ):
295 continue
296 rend = rstart + lseq - 1 # end included
297 starts.append(rstart)
298 ends.append(rend)
299 n = regionEnd - regionStart + 1
300 if len(starts) > 0:
301 starts = np.array(starts)
302 ends = np.array(ends)
304 # Fragments fully spanning the window boundaries
305 span_start = starts + self.protection_size - regionStart
306 span_end = ends - self.protection_size - regionStart + 2
307 valid = span_end >= span_start
308 outside = np.zeros(n + 2, dtype=int)
309 np.add.at(
310 outside,
311 np.clip(span_start[valid] + 1, 0, n + 1),
312 1,
313 )
314 np.add.at(
315 outside,
316 np.clip(span_end[valid], 0, n + 1),
317 -1,
318 )
319 np.add.at(
320 outside,
321 np.clip(span_start[~valid] + 1, 0, n + 1),
322 -1,
323 )
324 np.add.at(
325 outside,
326 np.clip(span_end[~valid], 0, n + 1),
327 1,
328 )
330 outside_cum = np.cumsum(outside)[:-1]
332 # Fragments whose endpoints fall inside windows
333 all_ends = np.concatenate([starts, ends]) - regionStart
334 left = np.clip(all_ends - self.protection_size + 2, 0, n + 1)
335 right = np.clip(all_ends + self.protection_size + 1, 0, n + 1)
336 inside = np.zeros(n + 2, dtype=int)
337 np.add.at(inside, left, 1)
338 np.add.at(inside, right, -1)
339 inside_cum = np.cumsum(inside)[:-1]
341 wps = outside_cum - inside_cum
342 else:
343 outside_cum = np.zeros(n, dtype=int)
344 inside_cum = np.zeros(n, dtype=int)
345 wps = np.zeros(n, dtype=int)
346 for i in range(regionEnd - regionStart + 1):
347 if verbose_output:
348 outfile.write(
349 "%s\t%d\t%d\t%d\t%d\t%d\n"
350 % (
351 chrom,
352 regionStart + i,
353 regionStart + i + 1, # add end coordinate
354 outside_cum[i],
355 inside_cum[i],
356 wps[i],
357 )
358 )
359 else:
360 outfile.write(
361 "%s\t%d\t%d\t%d\n"
362 % (
363 chrom,
364 regionStart + i,
365 regionStart + i + 1, # add end coordinate
366 wps[i],
367 )
368 )