From bc4526da3328b73e6755db640f07928aa056d1c4 Mon Sep 17 00:00:00 2001
From: Guoyi Zhang
Date: Tue, 10 Sep 2024 02:02:03 +1000
Subject: [PATCH] polish: use D instead cpp version

---
Review notes (not part of the commit message): delstop.d below was amended
during review, so the blob id on its `index` line is stale.
 * processFastaAA: the last record is now included in the rewritten file.
 * processFastaNT: an unknown taxon is detected through countUntil's -1
   result, and the rewritten file ends with a newline.

 delstop.cpp | 181 ----------------------------------------------------
 delstop.d   | 180 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 180 insertions(+), 181 deletions(-)
 delete mode 100644 delstop.cpp
 create mode 100644 delstop.d

diff --git a/delstop.cpp b/delstop.cpp
deleted file mode 100644
index 911b1ac..0000000
--- a/delstop.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * This script is for deleting stop codon
- * in both AA and original_NT files
- * Copyright
- * Guoyi Zhang, UNSW & Australian Museum
- */
-
-#include <filesystem>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace fs = std::filesystem;
-
-class FastaProcessor {
-   public:
-	FastaProcessor(const std::string& inputFile,
-		       const std::string& outputFile)
-	    : input_file(inputFile), output_file(outputFile) {}
-
-	void processFile() {
-		// bakup input file
-		std::string backup_file = input_file + ".bak";
-		fs::copy_file(input_file, backup_file,
-			      fs::copy_options::overwrite_existing);
-
-		std::ifstream infile(input_file);
-		std::ofstream outfile(input_file + ".tmp"); // temp file
-		std::string line;
-
-		while (std::getline(infile, line)) {
-			if (line[0] == '>') {
-				if (!sequence.empty()) {
-					positions_map[taxa] =
-					    processSequence(sequence);
-					outfile << taxa << "\n"
-						<< sequence << "\n";
-				}
-				taxa = line;
-				sequence.clear();
-			} else {
-				sequence = line;
-			}
-		}
-
-		// process final taxa
-		if (!sequence.empty()) {
-			positions_map[taxa] = processSequence(sequence);
-			outfile << taxa << "\n" << sequence << "\n";
-		}
-
-		infile.close();
-		outfile.close();
-
-		// overwrite with temp file
-		fs::rename(input_file + ".tmp", input_file);
-
-		// process same name file
-		if (fs::exists(output_file)) {
-			// backup outputfile
-			std::string output_backup_file = output_file + ".bak";
-			fs::copy_file(output_file, output_backup_file,
-				      fs::copy_options::overwrite_existing);
-
-			processOutputFile();
-			fs::rename(output_file + ".tmp", output_file);
-		}
-	}
-
-   private:
-	std::string input_file;
-	std::string output_file;
-	std::string taxa;
-	std::string sequence;
-	std::unordered_map<std::string, std::vector<size_t>> positions_map;
-
-	std::vector<size_t> processSequence(std::string& sequence) {
-		std::vector<size_t> positions;
-		size_t non_dash_index = 0;
-
-		for (size_t i = 0; i < sequence.size(); ++i) {
-			if (sequence[i] != '-') {
-				non_dash_index++;
-				if (sequence[i] == '!' || sequence[i] == '*') {
-					positions.push_back(
-					    non_dash_index); // pos without -
-					sequence[i] = '-'; // replace with -
-				}
-			}
-		}
-		return positions;
-	}
-
-	void processOutputFile() {
-		std::ifstream infile(output_file);
-		std::ofstream outfile(output_file + ".tmp");
-		std::string line;
-		std::string current_taxa;
-		std::string current_sequence;
-
-		while (std::getline(infile, line)) {
-			if (line[0] == '>') {
-				if (!current_sequence.empty() &&
-				    positions_map.find(current_taxa) !=
-					positions_map.end()) {
-					replacePositions(
-					    current_sequence,
-					    positions_map[current_taxa]);
-					outfile << current_taxa << "\n"
-						<< current_sequence << "\n";
-				}
-				current_taxa = line;
-				current_sequence.clear();
-			} else {
-				current_sequence = line;
-			}
-		}
-
-		// process final taxa
-		if (!current_sequence.empty() &&
-		    positions_map.find(current_taxa) != positions_map.end()) {
-			replacePositions(current_sequence,
-					 positions_map[current_taxa]);
-			outfile << current_taxa << "\n"
-				<< current_sequence << "\n";
-		}
-
-		infile.close();
-		outfile.close();
-	}
-
-	void replacePositions(std::string& sequence,
-			      const std::vector<size_t>& positions) {
-		std::string new_sequence = sequence;
-		size_t sequence_length = sequence.size();
-
-		for (size_t pos : positions) {
-			size_t start = (pos * 3) - 3; // 3n-2
-			size_t end = pos * 3; // 3n
-
-			if (start < sequence_length) {
-				for (size_t i = start;
-				     i < end && i < sequence_length; ++i) {
-					new_sequence[i] = '-';
-				}
-			}
-		}
-
-		sequence = new_sequence;
-	}
-};
-
-int main(int argc, char* argv[]) {
-	if (argc != 3) {
-		std::cerr << "Usage: " << argv[0]
-			  << " <input_folder> <output_folder>" << std::endl;
-		return 1;
-	}
-
-	std::string input_folder = argv[1];
-	std::string output_folder = argv[2];
-
-	// get all fasta
-	for (const auto& entry : fs::directory_iterator(input_folder)) {
-		if (entry.path().extension() == ".fasta" ||
-		    entry.path().extension() == ".fas") {
-			std::string input_file = entry.path().string();
-			std::string output_file =
-			    output_folder + "/" +
-			    entry.path().filename().string();
-
-			FastaProcessor processor(input_file, output_file);
-			processor.processFile();
-		}
-	}
-
-	return 0;
-}
-
diff --git a/delstop.d b/delstop.d
new file mode 100644
index 0000000..f0af3a0
--- /dev/null
+++ b/delstop.d
@@ -0,0 +1,180 @@
+import std.file;
+import std.stdio;
+import std.string;
+import std.algorithm;
+import std.array;
+import std.conv;
+
+/// Stop-codon markers found in one FASTA record.
+struct TaxaInfo {
+    string name;     // header text after the leading '>'
+    int[] positions; // 1-based indices of '*' / '!' in the AA sequence
+}
+
+/// Per-record results for a whole AA FASTA file.
+struct FastaData {
+    TaxaInfo[] taxaList;   // one TaxaInfo per record, in file order
+    bool foundSpecialChar; // true when any record contained '*' or '!'
+}
+
+/// Copy `filePath` to `filePath ~ ".bak"`.
+void backup(string filePath) {
+    string backupPath = filePath ~ ".bak";
+    copy(filePath, backupPath);
+}
+
+/// Return `sequence` with every 1-based index in `positions` set to '-'.
+string replaceSpecialChars(string sequence, const(int)[] positions) {
+    char[] mutableSequence = sequence.dup; // strings are immutable; edit a copy
+    foreach (pos; positions) {
+        mutableSequence[pos - 1] = '-'; // replace `!` and `*` with `-`
+    }
+    return mutableSequence.idup;
+}
+
+/// Return the 1-based index of every '*' or '!' in `sequence`.
+int[] findSpecialPositions(const string sequence) {
+    int[] positions;
+    foreach (i, char c; sequence) {
+        if (c == '*' || c == '!') {
+            positions ~= cast(int) (i + 1); // record `int` pos
+        }
+    }
+    return positions;
+}
+
+/**
+ * Scan the amino-acid FASTA at `filePath`, record where each sequence has a
+ * stop-codon marker ('*' or '!'), and replace those markers with '-'.
+ *
+ * If any marker is found, the file is backed up to `.bak` and rewritten with
+ * each sequence on a single line; otherwise the file is left untouched.
+ *
+ * Returns: the positions per record plus the `foundSpecialChar` flag.
+ */
+FastaData processFastaAA(string filePath) {
+    FastaData fastaData; // create struct
+    string[] lines = File(filePath).byLine().map!(line=>line.to!string).array; // read file
+    string currentTaxa;
+    string currentSequence;
+    string tmpContent;
+
+    // Store the pending record and queue its cleaned text for the rewrite.
+    // Using one routine for both the loop and the end of input keeps the
+    // last record from being left out of the rewritten file.
+    void flushRecord() {
+        if (currentTaxa.empty) {
+            return; // no named record pending
+        }
+        TaxaInfo taxaInfo;
+        taxaInfo.name = currentTaxa;
+        taxaInfo.positions = findSpecialPositions(currentSequence);
+        fastaData.taxaList ~= taxaInfo; // add TaxaInfo to taxaList
+
+        if (taxaInfo.positions.length > 0) {
+            fastaData.foundSpecialChar = true; // check any special char
+        }
+
+        tmpContent ~= ">" ~ currentTaxa ~ "\n"
+            ~ replaceSpecialChars(currentSequence, taxaInfo.positions) ~ "\n";
+        currentSequence = ""; // clean prepare next
+    }
+
+    foreach (line; lines) {
+        if (line.startsWith(">")) {
+            flushRecord(); // keep last sequence
+            currentTaxa = line[1..$]; // get taxa
+        } else {
+            currentSequence ~= line; // get seq
+        }
+    }
+    flushRecord(); // the final one
+
+    if (fastaData.foundSpecialChar) {
+        backup(filePath);
+        // write back
+        std.file.write(filePath, tmpContent);
+    }
+    return fastaData;
+}
+
+/**
+ * In the nucleotide FASTA at `filePath`, replace with '-' the codon
+ * (bases 3n-2 .. 3n) for every amino-acid position n stored in `fastaData`.
+ *
+ * Records are matched on header text; records with no match in `fastaData`
+ * are left as they are. The line wrapping of each sequence is preserved.
+ */
+void processFastaNT(string filePath, const FastaData fastaData) {
+    string[] lines = File(filePath).byLine().map!(line=>line.to!string).array; // get fasta_nt
+
+    for (int i = 0; i < lines.length; i++) {
+        if (lines[i].startsWith(">")) {
+            string currentTaxa = lines[i][1..$];
+            auto taxaIndex = fastaData.taxaList.countUntil!(t => t.name == currentTaxa); // use countUntil find taxaName
+
+            // countUntil yields -1 (not the length) when nothing matches
+            if (taxaIndex >= 0) { // if find taxa
+                int[] positions = fastaData.taxaList[taxaIndex].positions.dup;
+
+                int lineIndex = i + 1;
+                string sequence;
+                while (lineIndex < lines.length && !lines[lineIndex].startsWith(">")) {
+                    sequence ~= lines[lineIndex]; // get full
+                    lineIndex++;
+                }
+
+                // replace
+                char[] mutableSequence = sequence.dup;
+                foreach (pos; positions) {
+                    int startPos = 3 * pos - 2; // 3n-2
+                    int endPos = 3 * pos; // 3n
+                    if (startPos - 1 < mutableSequence.length) {
+                        for (int j = startPos - 1; j < endPos && j < mutableSequence.length; j++) {
+                            mutableSequence[j] = '-'; // 3n-2 to 3n '-'
+                        }
+                    }
+                }
+                sequence = mutableSequence.idup;
+
+                // write to fasta_nt
+                int sequencePos = 0;
+                lineIndex = i + 1;
+                while (lineIndex < lines.length && !lines[lineIndex].startsWith(">")) {
+                    int len = cast(int) lines[lineIndex].length; // convert to int
+                    lines[lineIndex] = sequence[sequencePos..sequencePos + len];
+                    sequencePos += len;
+                    lineIndex++;
+                }
+            }
+        }
+    }
+    // write back; byLine dropped the line terminators, so end with a newline
+    std.file.write(filePath, lines.empty ? "" : lines.join("\n") ~ "\n");
+}
+
+/// Clean `fasta_aa`; if it held stop codons, mask the same codons in `fasta_nt`.
+void processFasta(string fasta_aa, string fasta_nt){
+    // get pos from fasta_aa & modify
+    FastaData fastaData = processFastaAA(fasta_aa);
+
+    if (fastaData.foundSpecialChar) {
+        // special chars found: back up fasta_nt, then modify it
+        backup(fasta_nt);
+        processFastaNT(fasta_nt, fastaData);
+    }
+}
+
+/// Entry point: `program <fasta_aa> <fasta_nt>`.
+void main(string[] args) {
+    if (args.length < 3) {
+        writeln("Usage: program <fasta_aa> <fasta_nt>");
+        return;
+    }
+
+    string fasta_aa = args[1];
+    string fasta_nt = args[2];
+
+    processFasta(fasta_aa, fasta_nt);
+}
+