Coverage for lib/optwps.py: 10%

125 statements  

« prev     ^ index     » next       coverage.py v7.11.1, created at 2025-11-09 22:36 +0100

1#!/usr/bin/env python 

2""" 

3Fast Window Protection Score (WPS) Calculator for Cell-Free DNA Analysis. 

4 

5This module provides efficient computation of Window Protection Scores from BAM files, 

6designed for analyzing cell-free DNA fragmentation patterns and nucleosome positioning. 

7 

8The WPS algorithm identifies protected genomic regions by analyzing DNA fragment 

9coverage patterns. It counts fragments that span versus fragments whose endpoints 

10fall within a protection window around each genomic position. 

11 

12Example: 

13 Basic usage:: 

14 

15 from lib.optwps import WPS 

16 

17 wps_calc = WPS(protection_size=120) 

18 wps_calc.run(bamfile='input.bam', out_filepath='output.tsv') 

19 

20Note: 

21 - Op BAM Description 

22 - M 0 alignment match (can be a sequence match or mismatch) 

23 - I 1 insertion to the reference 

24 - D 2 deletion from the reference 

25 - N 3 skipped region from the reference 

26 - S 4 soft clipping (clipped sequences present in SEQ) 

27 - H 5 hard clipping (clipped sequences NOT present in SEQ) 

28 - P 6 padding (silent deletion from padded reference) 

29 - = 7 sequence match 

30 - X 8 sequence mismatch 

31 - pysam always uses 0-based coordinates, converting from 1-based of the sam files! 

32""" 

33 

34import os 

35import pysam 

36import random 

37import numpy as np 

38 

39from lib.utils import exopen, is_soft_clipped, ref_aln_length 

40from tqdm.auto import tqdm 

41 

42 

43class ROIGenerator: 

44 """ 

45 Generate regions of interest (ROI) for processing. 

46 

47 This class generates genomic regions either from a BED file or from the entire 

48 genome in the BAM file. Regions are yielded in chunks for memory-efficient processing. 

49 

50 Args: 

51 bed_file (str, optional): Path to BED file containing regions to process. 

52 If None, the entire genome will be processed. Default: None 

53 chunk_size (int, optional): Size of chunks in base pairs for processing. 

54 Default: 1e8 (100 megabases) 

55 """ 

56 

57 def __init__(self, bed_file=None, chunk_size=1e8): 

58 self.bed_file = bed_file 

59 self.chunk_size = chunk_size 

60 

61 def regions(self, bam_file=None): 

62 """ 

63 Generate regions for processing. 

64 

65 Yields genomic regions either from a BED file or from the entire genome 

66 referenced in the BAM file. Regions are chunked according to chunk_size. 

67 

68 Args: 

69 bam_file (str, optional): Path to BAM file. Required if no BED file 

70 is provided to determine genome regions. Default: None 

71 

72 Yields: 

73 tuple: (chromosome, start, end) for each region chunk 

74 

75 Raises: 

76 ValueError: If neither bed_file nor bam_file can provide regions 

77 """ 

78 if (self.bed_file is None) or (not os.path.exists(self.bed_file)): 

79 input_file = pysam.Samfile(bam_file, "rb") 

80 nchunks = sum( 

81 (input_file.get_reference_length(chrom) - 1) // self.chunk_size + 1 

82 for chrom in input_file.references 

83 ) 

84 iterator = tqdm(total=nchunks, desc="Processing genome regions") 

85 for chrom in input_file.references: 

86 chrom_length = input_file.get_reference_length(chrom) 

87 region_start = 0 

88 while region_start < chrom_length: 

89 region_end = min(region_start + self.chunk_size, chrom_length) 

90 yield chrom, region_start, region_end 

91 region_start = region_end 

92 iterator.update(1) 

93 iterator.close() 

94 else: 

95 # read number of lines in bed file 

96 nlines = sum(1 for _ in exopen(self.bed_file, "r")) 

97 with exopen(self.bed_file, "r") as bed: 

98 for line in tqdm(bed, total=nlines, desc="Processing BED regions"): 

99 chrom, start, end = line.split()[:3] 

100 chunk_start = int(start) 

101 chunk_end = min(chunk_start + self.chunk_size, int(end)) 

102 while chunk_start < int(end): 

103 yield chrom, chunk_start, chunk_end 

104 chunk_start = chunk_end 

105 chunk_end = min(chunk_start + self.chunk_size, int(end)) 

106 

107 

108class WPS: 

109 """ 

110 Window Protection Score (WPS) calculator for cell-free DNA analysis. 

111 

112 This class computes Window Protection Scores from aligned sequencing reads in BAM format. 

113 WPS quantifies the protection of DNA fragments around each genomic position, useful for 

114 identifying nucleosome positioning and other protected regions in cell-free DNA. 

115 

116 The algorithm calculates: 

117 - Outside score: fragments that completely span the protection window 

118 - Inside score: fragment endpoints falling within the protection window 

119 - WPS = outside - inside 

120 

121 Args: 

122 bed_file (str, optional): Path to BED file with regions to process. 

123 If None, processes entire genome. Default: None 

124 protection_size (int, optional): Total protection window size in base pairs. 

125 This value is divided by 2 to get the window on each side of the position. 

126 Default: 120 

127 min_insert_size (int, optional): Minimum insert/fragment size to include. 

128 If None, no minimum filter applied. Default: None 

129 max_insert_size (int, optional): Maximum insert/fragment size to include. 

130 If None, no maximum filter applied. Default: None 

131 valid_chroms (set, optional): Set of valid chromosome names to process. 

132 Default: chromosomes 1-22, X, Y 

133 chunk_size (float, optional): Region chunk size for processing. 

134 Default: 1e8 (100 Mb) 

135 

136 Attributes: 

137 bed_file (str): Path to BED file or None 

138 protection_size (int): Half of the protection window size 

139 valid_chroms (set): Set of valid chromosome names 

140 min_insert_size (int): Minimum fragment size filter 

141 max_insert_size (int): Maximum fragment size filter 

142 chunk_size (float): Chunk size for processing 

143 roi_generator (ROIGenerator): Region generator instance 

144 

145 Example: 

146 >>> wps = WPS(protection_size=120, min_insert_size=120, max_insert_size=180) 

147 >>> wps.run(bamfile='sample.bam', out_filepath='wps_output.tsv') 

148 

149 Note: 

150 - Automatically filters duplicate, QC-failed, and unmapped reads 

151 - Handles both paired-end and single-end sequencing data 

152 - Supports downsampling for high-coverage samples 

153 """ 

154 

155 def __init__( 

156 self, 

157 bed_file=None, 

158 protection_size=120, 

159 min_insert_size=None, 

160 max_insert_size=None, 

161 valid_chroms=set(map(str, list(range(1, 23)) + ["X", "Y"])), 

162 chunk_size=1e8, 

163 ): 

164 self.bed_file = bed_file 

165 self.protection_size = protection_size // 2 

166 if valid_chroms is not None: 

167 self.valid_chroms = [x.replace("chr", "") for x in valid_chroms] 

168 else: 

169 self.valid_chroms = None 

170 self.min_insert_size = min_insert_size 

171 self.max_insert_size = max_insert_size 

172 self.chunk_size = chunk_size 

173 self.roi_generator = ROIGenerator(bed_file=self.bed_file) 

174 

175 def __call__(self, *args, **kwargs): 

176 return self.run(*args, **kwargs) 

177 

178 def run( 

179 self, 

180 bamfile, 

181 out_filepath=None, 

182 downsample_ratio=None, 

183 verbose_output=False, 

184 ): 

185 """ 

186 Calculate Window Protection Score for all regions and write to file. 

187 

188 Processes the BAM file to compute WPS values for each genomic position 

189 in the specified regions (or entire genome). Results are written to a 

190 tab-separated output file. 

191 

192 Args: 

193 bamfile (str): Path to input BAM file (must be sorted and indexed) 

194 out_filepath (str): Path to output TSV file, or stdout (default) 

195 downsample_ratio (float, optional): Fraction of reads to randomly keep 

196 (0.0 to 1.0). Useful for high-coverage samples. Default: None (no downsampling) 

197 

198 Returns: 

199 None: Results are written directly to out_filepath 

200 

201 Output Format: 

202 Tab-separated file with columns: 

203 1. chromosome: Chromosome name (without 'chr' prefix) 

204 2. start: Start position (0-based) 

205 3. end: End position (1-based, start + 1) 

206 4. outside: Count of fragments spanning the protection window (if verbose_output) 

207 5. inside: Count of fragment endpoints in protection window (if verbose_output) 

208 6. wps: Window Protection Score (outside - inside) 

209 

210 Raises: 

211 FileNotFoundError: If bamfile does not exist 

212 ValueError: If downsample_ratio is not between 0 and 1 

213 

214 Example: 

215 >>> wps = WPS() 

216 >>> wps.run(bamfile='input.bam', out_filepath='output.tsv') 

217 >>> # With downsampling 

218 >>> wps.run(bamfile='input.bam', out_filepath='output.tsv', 

219 ... downsample_ratio=0.5) 

220 """ 

221 if out_filepath is None: 

222 out_filepath = "stdout" 

223 input_file = pysam.Samfile(bamfile, "rb") 

224 prefix = ( 

225 "chr" if any(r.startswith("chr") for r in input_file.references) else "" 

226 ) 

227 with exopen(out_filepath, "w") as outfile: 

228 

229 for chrom, start, end in self.roi_generator.regions(bam_file=bamfile): 

230 if "chr" in chrom: 

231 chrom = chrom.replace("chr", "") 

232 if self.valid_chroms is not None and chrom not in self.valid_chroms: 

233 continue 

234 try: 

235 regionStart, regionEnd = int(start), int(end) 

236 except ValueError: 

237 continue 

238 

239 starts = [] 

240 ends = [] 

241 for read in input_file.fetch( 

242 prefix + chrom, 

243 max(0, regionStart - self.protection_size - 1), 

244 regionEnd + self.protection_size + 1, 

245 ): 

246 if read.is_duplicate or read.is_qcfail or read.is_unmapped: 

247 continue 

248 if is_soft_clipped(read.cigartuples): 

249 continue 

250 

251 if read.is_paired: 

252 if read.mate_is_unmapped: 

253 continue 

254 if read.rnext != read.tid: 

255 continue 

256 if read.is_read1 or ( 

257 read.is_read2 

258 and read.pnext + read.qlen 

259 < regionStart - self.protection_size - 1 

260 ): 

261 if read.isize == 0: 

262 continue 

263 if ( 

264 downsample_ratio is not None 

265 and random.random() >= downsample_ratio 

266 ): 

267 continue 

268 lseq = abs(read.isize) 

269 if ( 

270 self.min_insert_size is not None 

271 and lseq < self.min_insert_size 

272 ): 

273 continue 

274 if ( 

275 self.max_insert_size is not None 

276 and lseq > self.max_insert_size 

277 ): 

278 continue 

279 rstart = min(read.pos, read.pnext) 

280 rend = rstart + lseq - 1 

281 starts.append(rstart) 

282 ends.append(rend) 

283 else: 

284 if ( 

285 downsample_ratio is not None 

286 and random.random() >= downsample_ratio 

287 ): 

288 continue 

289 rstart = read.pos 

290 lseq = ref_aln_length(read.cigartuples) 

291 if self.min_insert_size is not None and ( 

292 (lseq < self.min_insert_size) 

293 or (lseq > self.max_insert_size) 

294 ): 

295 continue 

296 rend = rstart + lseq - 1 # end included 

297 starts.append(rstart) 

298 ends.append(rend) 

299 n = regionEnd - regionStart + 1 

300 if len(starts) > 0: 

301 starts = np.array(starts) 

302 ends = np.array(ends) 

303 

304 # Fragments fully spanning the window boundaries 

305 span_start = starts + self.protection_size - regionStart 

306 span_end = ends - self.protection_size - regionStart + 2 

307 valid = span_end >= span_start 

308 outside = np.zeros(n + 2, dtype=int) 

309 np.add.at( 

310 outside, 

311 np.clip(span_start[valid] + 1, 0, n + 1), 

312 1, 

313 ) 

314 np.add.at( 

315 outside, 

316 np.clip(span_end[valid], 0, n + 1), 

317 -1, 

318 ) 

319 np.add.at( 

320 outside, 

321 np.clip(span_start[~valid] + 1, 0, n + 1), 

322 -1, 

323 ) 

324 np.add.at( 

325 outside, 

326 np.clip(span_end[~valid], 0, n + 1), 

327 1, 

328 ) 

329 

330 outside_cum = np.cumsum(outside)[:-1] 

331 

332 # Fragments whose endpoints fall inside windows 

333 all_ends = np.concatenate([starts, ends]) - regionStart 

334 left = np.clip(all_ends - self.protection_size + 2, 0, n + 1) 

335 right = np.clip(all_ends + self.protection_size + 1, 0, n + 1) 

336 inside = np.zeros(n + 2, dtype=int) 

337 np.add.at(inside, left, 1) 

338 np.add.at(inside, right, -1) 

339 inside_cum = np.cumsum(inside)[:-1] 

340 

341 wps = outside_cum - inside_cum 

342 else: 

343 outside_cum = np.zeros(n, dtype=int) 

344 inside_cum = np.zeros(n, dtype=int) 

345 wps = np.zeros(n, dtype=int) 

346 for i in range(regionEnd - regionStart + 1): 

347 if verbose_output: 

348 outfile.write( 

349 "%s\t%d\t%d\t%d\t%d\t%d\n" 

350 % ( 

351 chrom, 

352 regionStart + i, 

353 regionStart + i + 1, # add end coordinate 

354 outside_cum[i], 

355 inside_cum[i], 

356 wps[i], 

357 ) 

358 ) 

359 else: 

360 outfile.write( 

361 "%s\t%d\t%d\t%d\n" 

362 % ( 

363 chrom, 

364 regionStart + i, 

365 regionStart + i + 1, # add end coordinate 

366 wps[i], 

367 ) 

368 )