// aligets.h - reeads a multi-alignment with a weight line *** 01.06.2001 // f_toupper() // extNamSeq() // fgetsCut() // f_cutName() // f_getAlign() // f_setWeights() // using a weight matrix /***** void extNamSeq( char *buf, char* &pNam, char* &pSeq ) { int len = strlen( buf ); // 1. Find the very last blank while( len && buf[ len ] != ' ' ) { if( !isalpha( buf[ len ] ) && buf[ len ] != '-' ) buf[ len ] = 0; len--; } // 2. Pointer to the sequence is found pSeq = buf + len + 1; // 3. while( len && buf[ len ] == ' ' ) len--; buf[ len + 1 ] = 0; pNam = buf; } Align* readAlign( FILE *afile ) { int i; char buf[ 1028 ]; char *pNam, *pSeq; // 1. Skip first blanks. while( fgets( buf, 1028, afile ) ) if( strlen( buf ) > 2 ) break; // 2. Count number of sequences. int numSeq = 1; while( fgets( buf, 1028, afile ) ) { if( strlen( buf ) < 2 ) break; numSeq++; } // 3. Count sequences' length. fseek( afile, 0, SEEK_SET ); int *aLen = new int [ numSeq ]; memset( aLen, 0, numSeq*sizeof( aLen [ 0 ] ) ); while( fgets( buf, 1028, afile ) ) { if( strlen( buf ) > 2 ) { extNamSeq( buf, pNam, pSeq ); i = 0; aLen[ i ] += strlen( pSeq ); for( i = 1; i < numSeq; i++ ) { fgets( buf, 1028, afile ); extNamSeq( buf, pNam, pSeq ); aLen[ i ] += strlen( pSeq ); } } } // Check if all sequences n ave the same length. for( i = 1; i < numSeq; i++ ) if( aLen[ 0 ] != aLen[ i ] ) return NULL; int len = aLen[ 0 ]; delete aLen; // 4. Allocate rooms char **name = new char* [ numSeq ]; char **let = new char* [ numSeq ]; char **att = new char* [ numSeq ]; for( i = 0; i < numSeq; i++ ) { let[ i ] = new char [ len+1 ]; let[ i ][ 0 ] = 0; att[ i ] = new char [ len+1 ]; } // 5. At last read fseek( afile, 0, SEEK_SET ); int flag = 1; while( fgets( buf, 1028, afile ) ) { if( strlen( buf ) > 2 ) { i = 0; do { // This function extracts name & sequence pointes. extNamSeq( buf, pNam, pSeq ); if( flag ) { name[ i ] = new char [ strlen( pNam ) + 1 ]; strcpy( name[ i ], pNam ); } strcat( let[ i ], pSeq ); fgets( buf, 1028, afile ); i++; } while( i < numSeq ); flag = 0; } } for( i = 0; i < numSeq; i++ ) { for( int j = 0; j < len; j++ ) { char ch = let[ i ][ j ]; att[ i ][ j ] = ( isalpha( ch ) && isupper( ch ) ) ? A_HOMOLOGY : 0; } } Align *ali = new Align( numSeq, len , name, let, att ); return ali; } *****/ //void f_str2Upper( string& s ) { void f_toupper( char *s ) { long i, iL = strlen( s ); for( i = 0; i < iL; ++i ) { //*s = toupper( *s ); //++s; s[i] = toupper( s[i] ); } } void extNamSeq( char *buf, char* &pNam, char* &pSeq ) { int len = strlen( buf ); // 1. Find the very last blank (group) - find end of seq. while( len && ( buf[ len-1 ] == ' ' || buf[ len-1 ] == '\n' || buf[ len-1 ] == '\r' || buf[ len-1 ] == '\t' ) ) { buf[ len - 1 ] = 0; // OK also: buf[ --len ] = 0; len--; } // 1a. Find start of seq while( len && buf[ len ] != ' ' ) { if( !isalpha( buf[ len ] ) && buf[ len ] != '-' ) buf[ len ] = 0; // ?????? What for ? Maybe some digits, etc... len--; } // 2. Pointer to the sequence is found pSeq = buf + len + 1; // 3. "Strip off" blanks in front of seq - (almost) rtrim( seqName ) while( len && buf[ len ] == ' ' ) len--; buf[ len + 1 ] = 0; pNam = buf; //cout << " - len = " << len << endl; //getch(); // To upper case: f_toupper( pSeq ); } char* fgetsCut( char *str, int num, FILE *stream ) { char *ret = fgets( str, num, stream ); if( ret ) { if( str[0] == ' ' ) return ret; // ---------------> WL int len = strlen( str ); while( len && !isalpha( str[ len ] ) && str[ len ] != '-' ) str[ len-- ] = 0; } return ret; } /*** void cutNames( int numSeq, char** name ) { for( int i = 0; i < numSeq; i++ ) { char *curName = name[ i ]; int len = strlen( curName ) - 1; if( curName[ len-- ] == ')' ) { while( len >= 0 && ( curName[ len ] == ' ' || isdigit( curName[ len ] ) ) ) len--; if( len && curName[ len ] == '(' ) curName[ len-- ] = 0; while( len >= 0 && curName[ len ] == ' ' ) curName[ len-- ] = 0; } } } ***/ void f_cutName( char* curName ) { int len = strlen( curName ) - 1; if( curName[ len-- ] == ')' ) { while( len >= 0 && ( curName[ len ] == ' ' || isdigit( curName[ len ] ) ) ) len--; if( len && curName[ len ] == '(' ) curName[ len-- ] = 0; while( len >= 0 && curName[ len ] == ' ' ) curName[ len-- ] = 0; } } void f_getAlign( FILE *afile, vSNamesDef& vSName, vAliDef& vAli, string& sWL ) { char *ret = "1"; int iFrag = 0, iSeq = 0, iSeqLen=0; char buf[_ALIGN_BUFFER_LENGTH]; char *pNam, *pSeq; string sWLf=""; // one fragment of Weight Line bool bWLtop = false, bWLbottom = false; sWL = ""; // Loop on fragments while( ret ) { ++iFrag; //Skip non-seqs: /** while( ret = fgetsCut( buf, _ALIGN_BUFFER_LENGTH, afile ) ) { if( strlen( buf ) > 2 || buf[0] != ' ' ) break; // it is a seq } **/ while( ret = fgets( buf, _ALIGN_BUFFER_LENGTH, afile ) ) { if( strlen( buf ) > 2 && buf[0] != ' ' ) break; // it is a seq if( strlen( buf ) > 2 ) { sWLf = buf; #ifdef _TRACE cout << "*** sWLf = " << sWLf << endl; _GETCH; #endif } } if( !ret ) break; // ------------------> iSeq = 0; //Loop on seqs inside one fragment: //while( ret = fgetsCut( buf, _ALIGN_BUFFER_LENGTH, afile ) ) { do{ if( strlen( buf ) < 3 || buf[0] == ' ' ) { /** if( iFrag == 1 ) { if( sWLf.length() > 0 ) { cout << " - WL Top " << endl; getch(); bWLtop = true; } else if( strlen( buf ) > 2 ) { cout << " - WL Bottom " << endl; getch(); bWLbottom = true; } } if( bWLtop ) { sWL += sWLf.substr( pSeq - buf, iSeqLen ); } else if( bWLbottom ) { *(pSeq + iSeqLen) = 0; sWL += string( pSeq ); } **/ break; // fragment ended } //if(iSeq == 12 ) { // cout << "*** iSeq = 12, buf = " << buf << endl; //} // Seq // Get poiners to name and Seq: extNamSeq( buf, pNam, pSeq ); if( iSeq == 0) { iSeqLen = strlen(pSeq); cout << " - iSeqLen = " << iSeqLen << endl; //getch(); } // Fill Seq Name vector and 1st fragments of Seq Align vector: if( iFrag == 1 ) { f_cutName( pNam ); vSName.push_back( pNam ); #ifdef _TRACE cout << " - " << iSeq << ". SeqName = " << pNam << ", L=" << strlen(pNam) << endl; #endif vAli.push_back( pSeq ); #ifdef _TRACE cout << " - Seq Ali = " << pSeq << endl; #endif // Fill Seq Align vector: } else { // . . . vAli[iSeq] += pSeq; } ++iSeq; } while( ret = fgetsCut( buf, _ALIGN_BUFFER_LENGTH, afile ) ) ; if( iFrag == 1 ) { if( sWLf.length() > 0 ) { cout << " - WL Top " << endl; _GETCH; bWLtop = true; } else if( strlen( buf ) > 2 ) { cout << " - WL Bottom " << endl; _GETCH; bWLbottom = true; } } if( bWLtop ) { sWL += sWLf.substr( pSeq - buf, iSeqLen ); } else if( bWLbottom ) { *(pSeq + iSeqLen) = 0; sWL += string( pSeq ); } } //++cout << "*** WL = [" << sWL << "] len = " << sWL.length() << endl; //getch(); } double f_setWeights( vAliDef& vAli1, const char *matrName, double *sum ) { // returns max_sum // --------------------------------- load WM & recalc //Weight_matrix_pos *aWm = new Weight_matrix_pos; // [ numMatr ]; Weight_matrix *aWm = new Weight_matrix; // [ numMatr ]; FILE *mfile; if( ( mfile = fopen( matrName, "rb" ) ) != NULL && !(*aWm).fread( mfile ) ) { #ifdef _TRACE cout << " - *Recalc OK: " << matrName << endl; #endif fclose( mfile ); } else { cout << "\n* Unable read the matrix: " << matrName << endl;// Some diagnostic. Will be needed only in extraordinar cases... return -1; } double rnadna_frequences[ Max_num_letters ] = // A C D E { 0.25, 0.25, 0.00, 0.00, // F G H I 0.00, 0.25, 0.00, 0.00, // K L M N 0.00, 0.00, 0.00, 0.00, // P Q R S 0.00, 0.00, 0.00, 0.00, // T U V W Y * 0.25, 0.0, 0.00, 0.00, 0.00, 0.00 }; double aa_frequences[ Max_num_letters ] = { 0.085, 0.018, 0.055, 0.066, 0.040, // A,C,D,E,F 0.075, 0.022, 0.059, 0.059, 0.094, // G,H,I,K,L 0.025, 0.044, 0.052, 0.040, 0.050, // M,N,P,Q,R 0.069, 0.060, 0.000, 0.072, 0.014, 0.034, 0.0 // S,T,U,V,W,Y,- }; /*NIK*/ //if( normMatr ) /*NIK*/ // (*aWm).normalize( aa_frequences, aa_frequences ); /*NIK*/ (*aWm).recast( aa_frequences, aa_frequences ); //freq1, freq2 ); /*NIK*/ //int iHomFac = (*aWm).rescale( 0.01 ); //homolFactor ); cout << "*** ww: " << (*aWm).weight( 'S', 'Y' ) << " " << (*aWm).weight( 'G', 'L' ) << endl; int j, nSeq = vAli1.size(), iSeqLen = (vAli1[0]).length(); double sum_min =0.0, sum_max = 0.0; //-sum_min = (*aWm).weight( vAli1[ 0 ][ 0 ], vAli1[ 0 ][ 0 ] ); //++double *sum = new double[ iSeqLen ]; for( j = 0; j < iSeqLen; ++j ) { //double sum = 0.0; sum[j] = 0.0; for( int k1 = 0; k1 < nSeq; k1++ ) for( int k2 = 0; k2 < k1; k2++ ) sum[j] += (*aWm).weight( vAli1[ k1 ][ j ], vAli1[ k2 ][ j ] ); if( j < 10 ) cout << " - sum = " << sum[j] << endl; /** if( sum < 0 ) { cout << " - sum negative = " << sum << ", j = " << j << endl; sum = 0; for( int k1 = 0; k1 < nSeq; k1++ ) for( int k2 = 0; k2 < k1; k2++ ) { sum += (*aWm).weight( vAli1[ k1 ][ j ], vAli1[ k2 ][ j ] ); cout << " " << sum << vAli1[ k1 ][ j ] << "-" << vAli1[ k2 ][ j ]; } cout << endl; getch(); } **/ sum[j] /= ( Matrix_scale*0.5*nSeq*(nSeq-1) ); if( j == 0 || (sum[j] < sum_min) ) sum_min = sum[j]; if( sum[j] > sum_max ) sum_max = sum[j]; if( sum[j] < 0. ) sum[j] = 0.; } cout << " - sum min/max= " << sum_min << ", " << sum_max << endl; //getch(); return sum_max; } // Returns: 0 - protein, 1 - nucleotide. int f_getMolType( vAliDef& vAli ) { int molType; long i, j, nLet=0, nNuc=0; for( i = 0; i < vAli.size(); ++i ) { string& sAli = vAli[0]; for( j = 0; j < sAli.length(); ++j ) { char c = toupper( sAli[j] ); if( ('A' <= c) && (c <= 'Z') ) { ++nLet; if( c == 'A' || c == 'C' || c == 'G' || //c == 'N' || c == 'T' || c == 'U' ) ++nNuc; } } } // *tst nucleotide //molType = ( ( 100*nNuc > 20*nLet ) ? 1 : 0 ) ; molType = ( ( 100*nNuc > 80*nLet ) ? 1 : 0 ) ; //cout << " * Mol Type = " << molType << ", nLet/nNuc = " << nLet << "/" << nNuc << endl; //getch(); return molType; } //EOF