From e09037058668b61a52b44e6c55053f6385ad2c66 Mon Sep 17 00:00:00 2001 From: Guoyi Zhang Date: Wed, 3 Jul 2024 12:09:16 +1000 Subject: [PATCH] add: sortdiamond and splitfasta --- CMakeLists.txt | 12 ++++ sortdiamond.cpp | 143 ++++++++++++++++++++++++++++++++++++++++++++++++ splitfasta.cpp | 81 +++++++++++++++++++++++++++ 3 files changed, 236 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 sortdiamond.cpp create mode 100644 splitfasta.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..a25be68 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.0) + +project(RGB EPP) +SET( CMAKE_EXPORT_COMPILE_COMMANDS ON ) + +set(CMAKE_CXX_STANDARD 17) + +add_executable(splitfasta splitfasta.cpp) +add_executable(sortdiamond sortdiamond.cpp) + +target_compile_options(splitfasta PRIVATE -Wall -Wextra -pedantic -O3) +target_compile_options(sortdiamond PRIVATE -Wall -Wextra -pedantic -O3) diff --git a/sortdiamond.cpp b/sortdiamond.cpp new file mode 100644 index 0000000..2ac5077 --- /dev/null +++ b/sortdiamond.cpp @@ -0,0 +1,143 @@ +#include +#include +#include +#include +#include +#include + +using namespace std; + +// Function to generate reverse complement of a DNA sequence +string revcomp(const string &seq) { + string revseq; + for (auto it = seq.rbegin(); it != seq.rend(); ++it) { + switch (*it) { + case 'A': + revseq += 'T'; + break; + case 'T': + revseq += 'A'; + break; + case 'C': + revseq += 'G'; + break; + case 'G': + revseq += 'C'; + break; + case 'R': + revseq += 'Y'; + break; + case 'Y': + revseq += 'R'; + break; + case 'S': + revseq += 'S'; + break; + case 'W': + revseq += 'W'; + break; + case 'K': + revseq += 'M'; + break; + case 'M': + revseq += 'K'; + break; + case 'B': + revseq += 'V'; + break; + case 'D': + revseq += 'H'; + break; + case 'H': + revseq += 'D'; + break; + case 'V': + revseq += 'B'; + break; + case 'N': + revseq += 'N'; + break; + default: + break; + } + } + return revseq; +} + +int main(int argc, char *argv[]) { + if (argc != 3) { + cerr << "Usage: " << argv[0] << " " + << endl; + return 1; + } + + string in_name = argv[1]; + string ot_name = argv[2]; + + ifstream infile(in_name); + if (!infile) { + cerr << "Error opening input file: " << in_name << endl; + return 1; + } + + ofstream outfile(ot_name); + if (!outfile) { + cerr << "Error opening output file: " << ot_name << endl; + return 1; + } + + map> + max_map; // Key: sseqid, Value: pair + string line; + + while (getline(infile, line)) { + istringstream iss(line); + string fields[20]; // Adjust size if needed + int i = 0; + + while (iss >> fields[i] && i < 20) { + i++; + } + + string key = fields[1]; + double value = stod(fields[11]); + + if (max_map.find(key) != max_map.end()) { + if (value > max_map[key].first) { + max_map[key] = make_pair(value, line); + } + } else { + max_map[key] = make_pair(value, line); + } + } + + vector> result; + + for (const auto &entry : max_map) { + istringstream iss(entry.second.second); + string fields[20]; // Adjust size if needed + int i = 0; + + while (iss >> fields[i] && i < 20) { + i++; + } + + if (stoi(fields[6]) > stoi(fields[7])) { + fields[17] = revcomp(fields[17]); + } + + result.push_back(make_pair(">" + fields[1], fields[17])); + } + + sort(result.begin(), result.end()); + + for (const auto &entry : result) { + outfile << entry.first << "\n" << entry.second << endl; + } + + infile.close(); + outfile.close(); + + return 0; +} + diff --git a/splitfasta.cpp b/splitfasta.cpp new file mode 100644 index 0000000..b25c539 --- /dev/null +++ b/splitfasta.cpp @@ -0,0 +1,81 @@ +// #include + +#include +#include +#include +// #include +#include +// #include + +namespace fs = std::filesystem; + +std::string removeExtension(const std::string& filename) { + size_t lastdot = filename.find_last_of("."); + if (lastdot != std::string::npos) { + return filename.substr(0, lastdot); + } + return filename; +} + +void splitFasta(const std::string& input_fasta) { + std::ifstream infile(input_fasta); + if (!infile) { + std::cerr << "Error: Unable to open input file " << input_fasta + << std::endl; + return; + } + + std::string line; + std::string dir_name; + std::ofstream outfile; + bool in_sequence = false; + + while (std::getline(infile, line)) { + if (line.empty()) continue; + + if (line[0] == '>') { + // New sequence header + if (in_sequence) { + outfile.close(); + } + dir_name = line.substr(1); // Remove '>' + // mkdir(dir_name.c_str(), 0777); // Create + // directory + fs::create_directories(dir_name); + fs::path output_file = fs::path(dir_name) / input_fasta; + outfile.open(output_file); + outfile << ">" << removeExtension(input_fasta) + << std::endl; + in_sequence = true; + } else if (in_sequence) { + // Inside sequence content + outfile << line << std::endl; + } + } + + if (in_sequence) { + outfile.close(); + } + + if (infile.eof()) { + std::cout << "Sequences have been split into individual files." + << std::endl; + } else { + std::cerr << "Error occurred while reading file." << std::endl; + } + + infile.close(); +} + +int main(int argc, char* argv[]) { + if (argc != 2) { + std::cerr << "Usage: " << argv[0] << " " + << std::endl; + return 1; + } + + splitFasta(argv[1]); + + return 0; +} +