-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathsplit_and_sort.new2.py
More file actions
executable file
·64 lines (51 loc) · 2.3 KB
/
split_and_sort.new2.py
File metadata and controls
executable file
·64 lines (51 loc) · 2.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python
from Bio import SeqIO
import argparse
import bz2
import itertools
import multiprocessing as mp
import logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
def index_fastq(file_path):
logger.info(f"Indexing FASTQ file: {file_path}")
return SeqIO.index(file_path, "fastq")
def write_fastq_bz2(task):
read_iterator, output_file = task
logger.info(f"Writing {output_file}")
with bz2.open(output_file, 'wt') as f:
SeqIO.write(read_iterator, f, "fastq")
logger.info(f"Finished writing")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--R1", help="R1 input file", required=True)
parser.add_argument("--R2", help="R2 input file", required=True)
parser.add_argument("-p", '--prefix', help="Output prefix", default='out')
parser.add_argument("-u", '--unpaired', help="File with unpaired read IDs")
args = parser.parse_args()
logger.info("Starting split_and_sort")
logger.info(f"Arguments: {args}")
R1_index = index_fastq(args.R1)
R2_index = index_fastq(args.R2)
logger.info("Creating read iterators")
paired_r1_id_sorted = sorted((r1_id for r1_id in R1_index if r1_id in R2_index))
paired_r2_id_sorted = sorted((r2_id for r2_id in R2_index if r2_id in R1_index))
paired_r1_iterator = (R1_index[r1_id] for r1_id in paired_r1_id_sorted)
paired_r2_iterator = (R2_index[r2_id] for r2_id in paired_r2_id_sorted)
tasks = [(paired_r1_iterator, args.prefix + '_R1.fastq.bz2'),
(paired_r2_iterator, args.prefix + '_R2.fastq.bz2')]
if args.unpaired:
logger.info(f"Reading unpaired reads")
unpaired_reads = set(line.strip() for line in bz2.open(args.unpaired, 'rt'))
logger.info(f"Creating unpaired iterator")
unpaired_reads_iter = itertools.chain(
(R1_index[r1_id] for r1_id in R1_index if r1_id in unpaired_reads),
(R2_index[r2_id] for r2_id in R2_index if r2_id in unpaired_reads)
)
tasks.append((unpaired_reads_iter, args.prefix + '_UN.fastq.bz2'))
logger.info("Starting writing output files")
for task in tasks:
write_fastq_bz2(task)
logger.info("split_and_sort completed successfully")
if __name__ == "__main__":
main()