/*
fasta2tfasta
============
This program takes at most one argument: the name of a file in FASTA
format. If no file name is given (or it is the empty string), the data
is read from standard input.

The program outputs one line per sequence ("tFASTA" format). Each line
has three columns, separated by TABs:
 - identifier
 - comment
 - sequence

Only a standard C++ compiler and library are needed to build it.

Compilation on Linux (add stuff like "-O2 -static -march=pentiumpro" if
you want to try to tune it a bit):
  g++ -o fasta2tfasta fasta2tfasta.cc
 - Or, if you have trouble running the binary across Linux versions, you
   may perform a static compilation:
  g++ -static -o fasta2tfasta fasta2tfasta.cc

The source code is pure ISO C++, except for the use of the "long long"
data type; however most (all?) compilers should accept "long long" by
now.
*/

/*
Copyright (C) 2005 Troels Arvin.

This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at
your option) any later version.

This library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this library; if not, write to the Free Software Foundation,
Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/

// NOTE(review): the names of the angle-bracket headers were lost from the
// damaged source; the list below is what the code in this file needs.
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include "string.h"

using namespace std;

/*
 * Returns the part of arg0 that follows the last '/', or all of arg0 if
 * it contains no '/'. Never modifies its argument.
 */
static const char* program_name(const char* arg0)
{
    if (!arg0) return "fasta2tfasta";
    const char* slash = strrchr(arg0, '/');
    return slash ? slash + 1 : arg0;
}

/*
 * Removes every occurrence of the character c from s, in place.
 */
static void strip_char(string& s, char c)
{
    s.erase(std::remove(s.begin(), s.end(), c), s.end());
}

/*
 * Prints the help text to STDERR and terminates the program.
 *
 * arg0: the program's argv[0]; only its last path component is shown.
 *
 * NOTE(review): the tail of this function was lost from the damaged
 * source. The "<filename>" placeholder, the final blank line and the exit
 * status are reconstructed -- confirm against the upstream file.
 */
static void usage(char* arg0)
{
    const char* progname = program_name(arg0);

    cerr << progname << ":" << endl
         << "Reads through a FASTA formatted file." << endl
         << "Outputs one line per sequence." << endl
         << "Each output line has three tab-delimited columns (\"tFASTA\" format):" << endl
         << " - identifier" << endl
         << " - comment" << endl
         << " - sequence" << endl
         << endl
         << "I.e., if the input file has lines like" << endl << endl
         << ">abc Interesting sequence" << endl
         << "CAGTTGAA" << endl
         << "GGGGAATTT" << endl << endl
         << "then the program will output" << endl << endl
         << "abc Interesting sequence CAGTTGAAGGGGAATTT" << endl
         << endl
         << "WARNING:" << endl
         << "If the comment-section of a FASTA header line contains TABs," << endl
         << "they will be converted to spaces, one space per TAB, and a warning" << endl
         << "will be printed to STDERR." << endl
         << "If a sequence contains TABs, the program will halt with an error message." << endl
         << "All instances of \\r (DOS line endings) will be removed." << endl
         << "In the sequence data, all spaces will be removed." << endl
         << endl
         << "Usage:" << endl
         << " " << progname << " <filename>" << endl
         << "filename: name of file containing FASTA-formatted data" << endl
         << endl;

    // main() relies on usage() not returning: it calls it and then carries
    // on as if a normal run had been requested.
    exit(1);
}

/*
 * Prints the text collected in msg to STDERR and terminates the program.
 *
 * NOTE(review): this helper was lost from the damaged source and is
 * reconstructed from its call sites (it receives a stringstream and must
 * not return). Confirm the message format and the exit status.
 */
static void err(const stringstream& msg)
{
    cerr << msg.str() << endl;
    exit(1);
}

/*
 * Splits a FASTA header line into identifier and comment.
 *
 * str:    the header line; its first character (the '>') is skipped.
 * parts:  receives the identifier and then, only if one exists, the
 *         comment. Elements are appended; the vector is not cleared.
 * linenr: line number used in the warning printed to STDERR when the
 *         comment contains a TAB.
 *
 * Every '\r' is dropped before parsing, so a DOS-formatted header without
 * a comment does not leave a '\r' at the end of the identifier. TABs in
 * the comment are turned into spaces, one space per TAB.
 *
 * NOTE(review): the start of the signature was lost from the damaged
 * source; "static void" and "const string& str" are reconstructed.
 */
static void tokenize_defline(const string& str, vector<string>& parts, long long linenr)
{
    string line(str);
    strip_char(line, '\r');

    if (line.empty()) {
        parts.push_back(string());
        return;
    }

    // The identifier runs from just after the '>' to the first blank.
    const string::size_type id_end = line.find_first_of(" \t", 1);
    if (string::npos == id_end) {
        parts.push_back(line.substr(1));
        return;
    }
    parts.push_back(line.substr(1, id_end - 1));

    // Skip the blanks between identifier and comment; if nothing follows,
    // the caller gets a one-element vector.
    const string::size_type comment_start = line.find_first_not_of(" \t", id_end);
    if (string::npos == comment_start) return;

    string comment = line.substr(comment_start);
    if (string::npos != comment.find('\t')) {
        cerr << "warning: TAB seen at line " << linenr << endl;
        std::replace(comment.begin(), comment.end(), '\t', ' ');
    }
    parts.push_back(comment);
}

/*
 * Writes one tFASTA record (identifier, comment, sequence) to STDOUT.
 */
static void emit_record(const string& id, const string& comment, const string& seq)
{
    // '\n' rather than endl: no need to flush after every record.
    cout << id << '\t' << comment << '\t' << seq << '\n';
}

/*
 * Reads FASTA data from the file named by argv[1], or from standard
 * input when no (or an empty) file name is given, and writes one
 * tab-separated line per sequence to STDOUT.
 *
 * Lines starting with ';' and empty lines are ignored. A record is only
 * written if it has a non-empty sequence. Line numbers in diagnostics are
 * 1-based.
 */
int main(const int argc, char* argv[])
{
    if (argc > 2) {
        usage(argv[0]);
    }

    // arg handling; an empty argument is treated like no argument
    const char* fname = 0;
    if (2 == argc && strlen(argv[1]) > 0) {
        fname = argv[1];
        // Prefix comparison: anything starting with "-h" or "--help"
        // requests the help text.
        if (strncmp(fname, "-h", 2) == 0) usage(argv[0]);
        if (strncmp(fname, "--help", 6) == 0) usage(argv[0]);
    }

    // open file, or point to stdin
    ifstream file;
    if (fname) {
        file.open(fname, ifstream::in);
        if (!file) {
            stringstream msg;
            msg << "Could not open file " << fname << " for reading";
            err(msg);
        }
    }
    istream& input = fname ? static_cast<istream&>(file) : cin;

    string id;
    string comment;
    string seq;
    string line;
    vector<string> parts;
    bool header_seen = false;
    long long linenr = 0;

    while (getline(input, line)) {
        ++linenr;

        // Drop DOS line endings first, so that a line holding nothing but
        // "\r" counts as blank instead of as sequence data.
        strip_char(line, '\r');
        if (line.empty() || ';' == line[0]) continue;

        if ('>' == line[0]) {
            header_seen = true;

            /* output the previous sequence, if one exists; then clear it */
            if (!seq.empty()) {
                emit_record(id, comment, seq);
            }
            seq.clear();

            /* locate ID, comment */
            parts.clear();
            tokenize_defline(line, parts, linenr);
            id = parts.at(0);
            comment = parts.size() > 1 ? parts[1] : string();
            continue;
        }

        /* sequence data: must follow a header and must not contain a tab */
        if (!header_seen) {
            stringstream msg;
            msg << "No header seen at line " << linenr;
            err(msg);
        }
        if (string::npos != line.find('\t')) {
            stringstream msg;
            msg << "Saw a tab in the sequence at line " << linenr;
            err(msg);
        }
        strip_char(line, ' ');
        seq += line;
    }

    /* output the last sequence, if one exists */
    if (!seq.empty()) {
        emit_record(id, comment, seq);
    }

    return 0;
}