src.co_tools.co_fastq
"""Helpers for locating and pairing paired-end fastq reads files."""
import os
import re
from glob import glob
from pathlib import Path

# CO_LOG may be unset, in which case os.getenv returns None; defaulting to ""
# keeps the import from crashing and falls back to the standard logger.
if os.getenv("CO_LOG", "").lower() == "true":
    from .get_logger import LOGGER

    log = LOGGER
else:
    import logging

    log = logging.getLogger(__name__)


def get_fastq_pair(dir_path: str = "../data"):
    """Return one forward/reverse pair of paired-end reads files.

    The folder tree under ``dir_path`` must contain exactly 2 folders in
    total (every nesting level is counted). The ``*.fastq.gz`` files are
    grouped by the prefix reported by ``get_prefix`` and the first prefix
    that is seen 3 times is selected; the forward and reverse files are then
    picked from that group with ``get_read_direction``.

    Args:
        dir_path (str, optional): The folder where all the reads files are.
            Defaults to "../data".

    Returns:
        str: comma-separated "<forward>,<reverse>" paths, or 0 when the
            folder layout is wrong, no prefix is seen 3 times, or no
            complementary pair is found.
    """
    # os.walk visits every level, so nested folders count towards the total.
    total_dirs = sum(len(subdirs) for _, subdirs, _ in os.walk(dir_path))
    if total_dirs != 2:
        log.error(
            f"The fastq files in {dir_path} are not properly configured"
            " to use this function. There should be only 2 folders"
            " inside the data folder."
        )
        return 0
    prefix_dict, this_prefix = {}, None
    for path in glob(f"{dir_path}/**/*.fastq.gz", recursive=True):
        prefix = get_prefix(path)
        if not prefix:
            log.warning(f"No prefix determined for {path}")
            continue
        paths = prefix_dict.setdefault(prefix, [])
        paths.append(path)
        if len(paths) == 3:
            log.info(f"prefix {prefix} occurs 3 times in the {dir_path} folder")
            this_prefix = prefix
            break
    if not prefix_dict:
        log.warning(f"No files found in {dir_path}")
        return 0
    if not this_prefix:
        log.warning(f"fastq files in {dir_path} not properly organized")
        return 0
    fwd, rev = None, None
    for path in prefix_dict[this_prefix]:
        # Resolve the direction once per file instead of re-parsing the name.
        direction = get_read_direction(path)
        if direction == "1":
            fwd = path
        elif direction == "2":
            rev = path
    if fwd and rev:
        log.info(f"returning {fwd},{rev}")
        return f"{fwd},{rev}"
    log.warning(f"Could not find complementary pair of fastq files in {dir_path}")
    return 0


def get_fwd_fastqs(dir: str = "../data"):
    """Return all the forward reads files in ascending alphabetical order.

    The forward pattern is derived with ``get_read_pattern`` from the first
    ``*.fastq.gz`` file found, then every file ending in that pattern is
    collected recursively.

    Args:
        dir (str, optional): The folder where all the reads files are.
            Defaults to "../data".

    Returns:
        str: newline-separated string of forward reads files, or 0 when the
            folder holds no fastq.gz files.
    """
    all_fastqs = glob(f"{dir}/**/*.fastq.gz", recursive=True)
    if not all_fastqs:
        log.error(f"There are no fastq.gz files in the {dir} directory")
        return 0
    log.debug(f"Found the following fastq files in the {dir} folder:\n{all_fastqs}")
    fwd_pattern = get_read_pattern(all_fastqs[0])
    fwd_fastqs = "\n".join(sorted(glob(f"{dir}/**/*{fwd_pattern}", recursive=True)))
    log.debug(f"Returning the following fwd fastq files\n{fwd_fastqs}")
    return fwd_fastqs


def get_read_direction(filepath: str):
    """Return the direction of a single paired-end reads file.

    Args:
        filepath (str): The path to the reads file you need the direction of

    Returns:
        str: "1" if the file is detected as forward, "2" otherwise. Returns 0
            when the file name has no underscore (likely a single-end file).
    """
    filename = Path(filepath).name
    log.debug(f"filename: {filename}")
    if "_" not in filename:
        log.warning(
            "You might be trying to use a single end reads file as a paired"
            f" end reads file. Current input: {filepath}"
        )
        return 0
    # Illumina names end in _R<read>_001.<ext>; the trailing "001" token
    # always contains "1", so the read number has to come from the R token.
    illumina = re.search(r"_R([12])_\d{3}\.[^_]*$", filename)
    if illumina:
        return illumina.group(1)
    read_token = filename.split("_")[-1].split(".")[0]
    return "1" if "1" in read_token else "2"


def get_read_pattern(filename: str, direction: str = "1"):
    """Return the file-name suffix shared by half of the paired-end reads files.

    Args:
        filename (str): Name of (or path to) the file to determine the
            pattern from
        direction (str, optional): The direction you need the pattern for.
            Defaults to "1". Accepts "1" for forward or "2" for reverse

    Returns:
        str: The pattern for all the forward or reverse paired-end reads
            files corresponding to the direction you specified in
            'direction'. For a path without any underscore the bare file
            name is returned unchanged.
    """
    if "_" not in filename and "/" in filename:
        log.warning(
            f"{filename} might be a single end reads file. The pattern being returned"
            " is the entire filename"
        )
        return Path(filename).name
    # Work on the file name only so underscores in folder names cannot leak
    # into the pattern.
    basename = Path(filename).name
    # Illumina names end in _R<read>_001.<ext>; the read number sits in the
    # second-to-last token, so the last token alone cannot tell reads apart.
    illumina = re.search(r"_R[12](_\d{3}\.[^_]*)$", basename)
    if illumina:
        pattern = f"R{direction}{illumina.group(1)}"
        log.debug(f"pattern: {pattern}")
        return pattern
    direction_complement = "2" if direction == "1" else "1"
    pattern = basename.split("_")[-1]
    log.debug(f"pattern: {pattern}")
    if direction in pattern:
        return pattern
    return pattern.replace(direction_complement, direction)


def get_prefix(filename: str, split_position: str = "-1"):
    """Return the prefix that is unique to one pair of paired-end files.

    Args:
        filename (str): The name of the file to determine prefix from
        split_position (str, optional): If underscores are in the filename and
            the user just needs to trim the filename after a certain
            underscore, then this arg specifies where to trim, e.g.
            get_prefix("GSM1234_sample12_exp.fastq.gz", -1) returns
            "GSM1234_sample12". Defaults to "-1".

    Returns:
        str: The prefix that is unique to a single pair of paired-end reads
            files, or 0 when no prefix could be determined.
    """
    filename = Path(filename).name
    # illumina read files use a specific format, and sometimes allow
    # underscores in the prefix
    # SampleName_S1_L001_R1_001.fastq.gz for lane 1
    # SampleName_S1_R1_001.fastq.gz for merged lanes.
    illumina_match = re.search(r"(.*?)_S\d+_.*R\d_001.fastq.gz", filename)
    if illumina_match:
        log.debug(
            f"match: {illumina_match}\ngroup 1 (prefix): {illumina_match.group(1)}"
        )
        return illumina_match.group(1)

    if "_" in filename:
        cut = int(split_position)
        if cut:
            prefix_list = filename.split("_")[:cut]
            log.debug(f"prefix_list: {prefix_list}")
            return "_".join(prefix_list)

    log.warning(f"A prefix was not able to be determined for {filename}")
    return 0


def get_rev_file(
    fwd_file: str, name_only=False, pattern_fwd: bool = False, pattern_rev: bool = False
):
    """Return the reverse reads file that pairs with a forward reads file.

    The reverse name is built by replacing ``pattern_fwd`` with
    ``pattern_rev`` inside ``fwd_file``.

    Args:
        fwd_file (str): The forward file you want to find the reverse
            file for.
        name_only (bool, optional): Set to True (or a string containing
            "true") if you want this function to return only the filename.
            Defaults to False.
        pattern_fwd (str, optional): The pattern to replace. Autodetected
            with ``get_read_pattern`` when falsy. Defaults to False.
        pattern_rev (str, optional): The replacement pattern. Autodetected
            with ``get_read_pattern`` when falsy. Defaults to False.

    Returns:
        str: The reverse reads file
    """
    if name_only:
        # Truthy values only count when their text contains "true".
        name_only = "true" in str(name_only).lower()
    if not pattern_fwd:
        pattern_fwd = get_read_pattern(fwd_file, "1")
        log.debug(f"Autodetected forward pattern: {pattern_fwd}")
    if not pattern_rev:
        pattern_rev = get_read_pattern(fwd_file, "2")
        log.debug(f"Autodetected reverse pattern: {pattern_rev}")
    log.debug(f"fwd_file: {fwd_file}\nWill replace {pattern_fwd} with {pattern_rev}")
    rev_file = fwd_file.replace(pattern_fwd, pattern_rev)
    return Path(rev_file).name if name_only else rev_file
def get_fastq_pair(dir_path: str = "../data"):
    """Return one forward/reverse pair of paired-end reads files.

    The folder tree under ``dir_path`` must contain exactly 2 folders in
    total (every nesting level is counted). The ``*.fastq.gz`` files are
    grouped by the prefix reported by ``get_prefix`` and the first prefix
    that is seen 3 times is selected; the forward and reverse files are then
    picked from that group with ``get_read_direction``.

    Args:
        dir_path (str, optional): The folder where all the reads files are.
            Defaults to "../data".

    Returns:
        str: comma-separated "<forward>,<reverse>" paths, or 0 when the
            folder layout is wrong, no prefix is seen 3 times, or no
            complementary pair is found.
    """
    # os.walk visits every level, so nested folders count towards the total.
    total_dirs = sum(len(subdirs) for _, subdirs, _ in os.walk(dir_path))
    if total_dirs != 2:
        log.error(
            f"The fastq files in {dir_path} are not properly configured"
            " to use this function. There should be only 2 folders"
            " inside the data folder."
        )
        return 0
    prefix_dict, this_prefix = {}, None
    for path in glob(f"{dir_path}/**/*.fastq.gz", recursive=True):
        prefix = get_prefix(path)
        if not prefix:
            log.warning(f"No prefix determined for {path}")
            continue
        paths = prefix_dict.setdefault(prefix, [])
        paths.append(path)
        if len(paths) == 3:
            log.info(f"prefix {prefix} occurs 3 times in the {dir_path} folder")
            this_prefix = prefix
            break
    if not prefix_dict:
        log.warning(f"No files found in {dir_path}")
        return 0
    if not this_prefix:
        log.warning(f"fastq files in {dir_path} not properly organized")
        return 0
    fwd, rev = None, None
    for path in prefix_dict[this_prefix]:
        # Resolve the direction once per file instead of re-parsing the name.
        direction = get_read_direction(path)
        if direction == "1":
            fwd = path
        elif direction == "2":
            rev = path
    if fwd and rev:
        log.info(f"returning {fwd},{rev}")
        return f"{fwd},{rev}"
    log.warning(f"Could not find complementary pair of fastq files in {dir_path}")
    return 0
This function returns a pair of paired-end reads files
Args: dir_path (str, optional): The folder where all the reads files are. Defaults to "../data".
Returns: str: comma-separated pair of reads files as path to files
def get_fwd_fastqs(dir: str = "../data"):
    """Return all the forward reads files in ascending alphabetical order.

    The forward pattern is derived with ``get_read_pattern`` from the first
    ``*.fastq.gz`` file found, then every file ending in that pattern is
    collected recursively.

    Args:
        dir (str, optional): The folder where all the reads files are.
            Defaults to "../data".

    Returns:
        str: newline-separated string of forward reads files, or 0 when the
            folder holds no fastq.gz files.
    """
    all_fastqs = glob(f"{dir}/**/*.fastq.gz", recursive=True)
    if not all_fastqs:
        log.error(f"There are no fastq.gz files in the {dir} directory")
        return 0
    log.debug(f"Found the following fastq files in the {dir} folder:\n{all_fastqs}")
    fwd_pattern = get_read_pattern(all_fastqs[0])
    fwd_fastqs = "\n".join(sorted(glob(f"{dir}/**/*{fwd_pattern}", recursive=True)))
    log.debug(f"Returning the following fwd fastq files\n{fwd_fastqs}")
    return fwd_fastqs
Returns all the forward reads files in ascending alphabetical order
Args: dir (str, optional): The folder where all the reads files are. Defaults to "../data".
Returns: str: newline-separated string of forward reads files
def get_read_direction(filepath: str):
    """Return the direction of a single paired-end reads file.

    Args:
        filepath (str): The path to the reads file you need the direction of

    Returns:
        str: "1" if the file is detected as forward, "2" otherwise. Returns 0
            when the file name has no underscore (likely a single-end file).
    """
    filename = Path(filepath).name
    log.debug(f"filename: {filename}")
    if "_" not in filename:
        log.warning(
            "You might be trying to use a single end reads file as a paired"
            f" end reads file. Current input: {filepath}"
        )
        return 0
    # Illumina names end in _R<read>_001.<ext>; the trailing "001" token
    # always contains "1", so the read number has to come from the R token.
    illumina = re.search(r"_R([12])_\d{3}\.[^_]*$", filename)
    if illumina:
        return illumina.group(1)
    read_token = filename.split("_")[-1].split(".")[0]
    return "1" if "1" in read_token else "2"
This function returns the direction of a single paired-end reads file
Args: filepath (str): The path to the reads file you need the direction of
Returns: str: Returns 1 if file is detected as forward, 2 otherwise
def get_read_pattern(filename: str, direction: str = "1"):
    """Return the file-name suffix shared by half of the paired-end reads files.

    Args:
        filename (str): Name of (or path to) the file to determine the
            pattern from
        direction (str, optional): The direction you need the pattern for.
            Defaults to "1". Accepts "1" for forward or "2" for reverse

    Returns:
        str: The pattern for all the forward or reverse paired-end reads
            files corresponding to the direction you specified in
            'direction'. For a path without any underscore the bare file
            name is returned unchanged.
    """
    if "_" not in filename and "/" in filename:
        log.warning(
            f"{filename} might be a single end reads file. The pattern being returned"
            " is the entire filename"
        )
        return Path(filename).name
    # Work on the file name only so underscores in folder names cannot leak
    # into the pattern.
    basename = Path(filename).name
    # Illumina names end in _R<read>_001.<ext>; the read number sits in the
    # second-to-last token, so the last token alone cannot tell reads apart.
    illumina = re.search(r"_R[12](_\d{3}\.[^_]*)$", basename)
    if illumina:
        pattern = f"R{direction}{illumina.group(1)}"
        log.debug(f"pattern: {pattern}")
        return pattern
    direction_complement = "2" if direction == "1" else "1"
    pattern = basename.split("_")[-1]
    log.debug(f"pattern: {pattern}")
    if direction in pattern:
        return pattern
    return pattern.replace(direction_complement, direction)
This function returns the pattern shared for half the paired-end reads files
Args: filename (str): Name of file to determine pattern from direction (str, optional): The direction you need the pattern for. Defaults to "1". Accepts "1" for forward or "2" for reverse
Returns: str: The pattern for all the forward or reverse paired-end reads files corresponding to the direction you specified in 'direction'
def get_prefix(filename: str, split_position: str = "-1"):
    """Return the prefix that is unique to one pair of paired-end files.

    Args:
        filename (str): The name of the file to determine prefix from
        split_position (str, optional): If underscores are in the filename and
            the user just needs to trim the filename after a certain
            underscore, then this arg specifies where to trim, e.g.
            get_prefix("GSM1234_sample12_exp.fastq.gz", -1) returns
            "GSM1234_sample12". Defaults to "-1".

    Returns:
        str: The prefix that is unique to a single pair of paired-end reads
            files, or 0 when no prefix could be determined.
    """
    filename = Path(filename).name
    # illumina read files use a specific format, and sometimes allow
    # underscores in the prefix
    # SampleName_S1_L001_R1_001.fastq.gz for lane 1
    # SampleName_S1_R1_001.fastq.gz for merged lanes.
    illumina_match = re.search(r"(.*?)_S\d+_.*R\d_001.fastq.gz", filename)
    if illumina_match:
        log.debug(
            f"match: {illumina_match}\ngroup 1 (prefix): {illumina_match.group(1)}"
        )
        return illumina_match.group(1)

    if "_" in filename:
        # Only parse split_position once an underscore is known to exist.
        cut = int(split_position)
        if cut:
            prefix_list = filename.split("_")[:cut]
            log.debug(f"prefix_list: {prefix_list}")
            return "_".join(prefix_list)

    log.warning(f"A prefix was not able to be determined for {filename}")
    return 0
This function returns the prefix that is unique to (1) pair of paired-end files
Args: filename (str): The name of the file to determine prefix from split_position (str, optional): If underscores are in the filename and the user just needs to trim the filename after a certain underscore, then this arg specifies where to trim, e.g. get_prefix("GSM1234_sample12_exp.fastq.gz", -1) returns "GSM1234_sample12". Defaults to "-1".
Returns: str: Returns the prefix that is unique to a single pair of paired-end reads files.
def get_rev_file(
    fwd_file: str, name_only=False, pattern_fwd: bool = False, pattern_rev: bool = False
):
    """Return the reverse reads file that pairs with a forward reads file.

    The reverse name is built by replacing ``pattern_fwd`` with
    ``pattern_rev`` inside ``fwd_file``.

    Args:
        fwd_file (str): The forward file you want to find the reverse
            file for.
        name_only (bool, optional): Set to True (or a string containing
            "true") if you want this function to return only the filename.
            Defaults to False.
        pattern_fwd (str, optional): The pattern to replace. Autodetected
            with ``get_read_pattern`` when falsy. Defaults to False.
        pattern_rev (str, optional): The replacement pattern. Autodetected
            with ``get_read_pattern`` when falsy. Defaults to False.

    Returns:
        str: The reverse reads file
    """
    if name_only:
        # Truthy values only count when their text contains "true".
        name_only = "true" in str(name_only).lower()
    if not pattern_fwd:
        pattern_fwd = get_read_pattern(fwd_file, "1")
        log.debug(f"Autodetected forward pattern: {pattern_fwd}")
    if not pattern_rev:
        pattern_rev = get_read_pattern(fwd_file, "2")
        log.debug(f"Autodetected reverse pattern: {pattern_rev}")
    log.debug(f"fwd_file: {fwd_file}\nWill replace {pattern_fwd} with {pattern_rev}")
    rev_file = fwd_file.replace(pattern_fwd, pattern_rev)
    return Path(rev_file).name if name_only else rev_file
Returns the reverse reads file that pairs with the given forward reads file.
Args: fwd_file (str): The forward file you want to find the reverse file for. name_only (bool, optional): Set to True if you want this function to return only the filename. Defaults to False. pattern_fwd (str, optional): The pattern to replace. Autodetected from fwd_file when left as False. Defaults to False. pattern_rev (str, optional): The replacement pattern. Autodetected from fwd_file when left as False. Defaults to False.
Returns: str: The reverse reads file