/* tfasta2fasta ============ This program reads "tFASTA" input and outputs FASTA. See the "usage" function below for a description. To compile this source code, you need to have "boost" installed: http://boost.org/ boost seems to be part of any modern open source operating system. Compilation on Linux (add stuff like "-O2 -static -march=pentiumpro" if you want to try to tune it a bit): g++ -o tfasta2fasta tfasta2fasta.cc - Or, if you have trouble running the binary across Linux versions, you may perform a static compilation: g++ -static -o tfasta2fasta tfasta2fasta.cc The source code is pure ISO C++, except for the use of the "long long" data type; however most (all?) compilers should accept "long long" by now. */ /* Copyright (C) 2005 Troels Arvin . This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ #include #include #include #include #include #include #include #include #include using namespace std; using namespace boost; static const int DEFAULT_CHARS_PER_LINE=60; static void usage(char* arg0) { char* progname=basename(arg0); cerr << progname << ":" << endl; cerr << "Reads through \"tFASTA\" formatted input." << endl; cerr << endl; cerr << "For each entry in the tFASTA input, the program will output at least one line:" << endl; cerr << " - a FASTA header line" << endl; cerr << " - zero or more lines of sequence characters, at most" << endl; cerr << " 80 chacters per line" << endl; cerr << endl; cerr << "I.e., if the input file has a line like" << endl; cerr << endl; cerr << "abc Interesting sequence CAGTTGAAGGGGAATTT" << endl; cerr << endl; cerr << "then the program will output" << endl; cerr << endl; cerr << ">abc Interesting sequence" << endl; cerr << "CAGTTGAAGGGGAATTT" << endl; cerr << endl; cerr << endl; cerr << "\"tFASTA\" is an alternative FASTA format where each sequence occupies one and" << endl; cerr << "only one line, including header data. The header ID, the header comment, and the" << endl; cerr << "sequence data are separated by TABs." << endl; cerr << endl; cerr << "Usage:" << endl << " " << basename(progname) << " [-n count] [filename]" << endl; cerr << "count: characters per line; 0 is illegal; default is " << DEFAULT_CHARS_PER_LINE << endl; cerr << "filename: name of file containing FASTA-formatted data; if empty, stdin is used" << endl < sep_t; typedef tokenizer tok_t; typedef tok_t::const_iterator tok_it_t; typedef string::const_iterator iter; string id; string comment; stringstream seq; stringstream msg; iter head; iter end; string line; long long linenr=0; long seq_pos=0; long seq_len=0; vector parts; ifstream is; unsigned short num_toks; unsigned int chars_per_line=0; char* fname=0; const char* opts="n:"; int res; while((res=getopt(argc,argv,opts))!=-1) { switch(res) { case 'n': chars_per_line=(unsigned int)atoi(optarg); if (0==chars_per_line) usage(argv[0]); break; default : usage(argv[0]); } } if (optind+1==argc) { fname=argv[optind]; } if (0==chars_per_line) { chars_per_line=DEFAULT_CHARS_PER_LINE; } // arg handling //if (strncmp(fname,"--help",6)==0) usage(argv[0]); // open file, or point to stdin if (fname) { is.open(fname,ifstream::in); if (!is) { msg << "Could not open file " << fname << " for reading"; err(msg); } } sep_t sep("\t"); tok_it_t it; while (getline(fname ? is:cin,line)) { num_toks=0; tok_t tok(line,sep); for(it=tok.begin(); it!=tok.end(); it++,num_toks++) { switch(num_toks) { case 0: cout << '>' << *it; break; case 1: cout << ' ' << *it; break; case 2: cout << endl; // split sequence into lines of max chars_per_line chars // (silently ignoring the case where sequence length // larger than size of a 'long'; TODO) seq_pos=0; seq_len=it->length(); while(seq_possubstr(seq_pos,chars_per_line) << endl; seq_pos+=chars_per_line; } break; default: msg << "more than three columns seen at line number " << linenr; err(msg); } } if (num_toks<3) { cerr << "WARNING: At line " << linenr << ": only " << num_toks << " columns" << endl; cout << endl; } if (0==seq_pos) { cout << endl; } linenr++; } /* output a sequence, if one exists */ if (seq.str().length()>0) { cout << id << '\t' << comment << '\t' << seq.str() << endl; } if (fname) { is.close(); } return 0; }