// - Dothelix motifs' collection maps: *20.12.2k // For each matrix: // - graphic if any // - its map msgs // - motifs "text" output if any // // All options in parm file. Its name (default "parm.txt") as arg 1 of main(). // All other file names in parm file (w/o defaults): // - Input // - Seq1 & Seq2 // - Matrix file(s) // - Output // - Image (PNG) file(s) // - Map msgs file (one to all matrices) // - Motifs' output (one to all matrices) // // Update history: // // 20.02.01 - (motif_map.h): align seq names in image map (append smaller seq with spaces); // - matrix name set to upper case in sOptLine. And drop the matrix desc; // - full color line instead of 4 color legends // // 10.04.01 - green/blue of seq1/seq2 in motifOutput(). // ------------------------- a few debug flags #define _GETCH //getch() #define _TRACE // ------------------------- #include //?#define min //?#define max // -------------------- #include #include //++ g++? #include #ifndef _MSC_VER // unistd.h ? #include #endif #include #include #include // ? #include // ------------------- GD components #include "gd.h" #include "gdfontg.h" #include "gdfonts.h" #include "gdfontt.h" // ------------------- SGI STL #include #include //#include // ------------------- __NEW logic (like swap logic) added to // swap - to get around that guy in #define __NEW #define swap // --------------------- GeneBee's //? #define index index_sqrt #include "inv_sqrt.h" //? #undef index #include "const.h" #include "work_seq.h" #include "dothelix.h" #include "seqbase.h" #include "w_matrix.h" #include "seqbase.cpp" #include "inv_sqrt.cpp" #include "motif.cpp" #include "params.cpp" #include "dothelix.cpp" // -------------------------------------------------- // Children & other new classes: #include "w_matrix_pos.h" #include "twoseq_progress.h" // ---------------------------------------- Constants / bounds // Upper bound of the plotted motifs ??? #define _PLOTTED_MAX 1000 // Upper bound of the printed motifs ??? #define _PRINTED_MAX 200 // Upper bound of the (DotHelix & any) power // (mind integer limit also) #define _POWER_MAX 500. // Image iW & iH #define _IMAGE_WIDTH 640 #define _IMAGE_HEIGHT 480 + 16 + 16 + 8 // Opt line upper bound (right side of the graphic) #define _OPTLINE_MAX 85 // Upper bound of the size of Seq name to be plotted at (0,0) #define _SEQ_NAME_LIMIT 16 // Lower bound of the "sensitive" motifs in graphic #define _SENS_MOTIFS 10 // ------------------------------------- Motifs' Graphics struct motif_opt { int x; // x.left - offset in the 1st seq int y; // y.left - offset in the 2nd seq int len; // motif's length }; #include "motif_map.h" // ------------------------------------- Weight_matrix_pos* GetWeights2( char *matrName ) { // matrName - name of the matrix file. FILE *mfile; Weight_matrix_pos *wm = new Weight_matrix_pos; if( ( mfile = fopen( matrName, "rb" ) ) == NULL || wm->fread( mfile ) ) { printf( "\n Can't read file with Weight matrix %s or can't load it\n" ); return NULL; } fclose( mfile ); return wm; } /**/ int MotifOutput( WorkSeq &wSeq1, int left1, char *name1, WorkSeq &wSeq2, int left2, char *name2, int len, int power, /*NIK*/ Weight_matrix& wm, char* matrName, int numMot, int numGroup, FILE *fOut ) { // Important - matrix MUSt be same as motifs were found... // May be you remember I told about opportunity to choose & use // several matrix. // numMot is useful for user and it marks begin of each motif output // name1 & name2 are come together with sequences: // >seq1Name // jsxhgjhxgfjhsdg.... // >seq2Name // jsxhgjhxgfjhsdg // They should be used in graphic interfase also ... char *markSym = " .+*"; // May be some other signs ... // 1. Get proper segments of the sequences and check natural condition for any case... const char *seq1 = wSeq1.seq() + left1; // left1 is counted from 0 [?] const char *seq2 = wSeq2.seq() + left2; if( wSeq1.len() < left1 + len || wSeq2.len() < left2 + len || !len ) { return 1; // Invalid motifs parameters } // 2. Create sign - string to be output char *sign = new char[ len ]; int i, sum = 0; //0.; for( i = 0; i < len; i++ ) { sign[ i ] = ' '; // The case the same letters - both for protein & nucleotyde cases... if( seq1[ i ] == seq2[ i ] ) { sign[ i ] = seq1[ i ]; sum++; // for homology percent counting } else { // different letters & protein - sign means degree of similarity /*NIK*/ if( strcpy( matrName, "NUCLEOTIDE" ) ) { // MQ_PROTEIN in seqbase.h // coefficient 0.44 is chosen for blosum62 matrix... ??? int idx = ( int )( ( ( wm.orig_elem[ wm.ord( seq1[ i ] ) ][ wm.ord( seq2[ i ] ) ] - wm.mean )/wm.mdisp )/0.44 ); idx = max( 0, min( 3, idx ) ); sign[ i ] = markSym[ idx ]; } // different letters & nucleotide - ' ' } } // 3. Output additional information //double pow = power/( ( double )( Matrix_scale*Norm_L ) ); double pow = power * ( double ) 0.1; // As a rule 0 < pow < 1000. //fprintf( fOut, "

Motif %3d, Power %6.2f Homology percent %5.3f Length %6d

\n", //fprintf( fOut, "

Motif %3d, Power %6.1f Homology percent %5.1f Length %6d

", fprintf( fOut, "

Motif %d-%d, Power %6.1f Homology percent %5.1f Length %6d

\n", numGroup, numMot, pow, 100.*sum/len, len ); //++fprintf( fOut, "Matrix: %s

", matrDesc ); // 4. Output proper motif & sign int from = 0, partLen = 60; // 60 is common *** partLen not used while( from < len ) { int n = min( 60, len - from ); // |<- 20 ->| so 20+60 = 80 // |<- 22 ->| so 20+60 = 80 fprintf( fOut, " %.*s\n", n, sign+from ); // from+left1+1 because of stupid user want to count from 1 // green + blue *** 10.04.2001 /*NIK*/ fprintf( fOut, "%-12s (%4d) %.*s\n", name1, from+left1+1, n, seq1+from ); /*NIK*/ fprintf( fOut, "%-12s (%4d) %.*s\n\n", name2, from+left2+1, n, seq2+from ); from += n; } fprintf( fOut, "\n" ); delete sign; return 0; } void f_fileBase( char *fName, char *buf, int lenMax ) { int iL = strlen( fName ); char *point = strrchr ( fName, '.' ); char *nr = strpbrk( fName, "\n\r" ); if( point == 0 ) point = nr; if( point == 0 ) point = fName + iL; // -> 0 char *slash = strrchr ( fName, '/' ); if( slash == 0 ) slash = strrchr ( fName, '\\' ); char *start = fName; if( slash > 0 ) start = slash + 1; int len = point - start; if( len > lenMax ) len = lenMax; strncpy( buf, start, len ); buf[len] = 0; } void f_toupper( char *s ) { int i, iL = strlen( s ); for( i = 0; i < iL; i++ ) s[i] = toupper( s[i] ); #ifdef _TRACE //+cout << "*** to upper *** iL = " << iL << ", " << s << endl; #endif } int main( int argc, char *argv[]) { int i, j=0; //+char *sMsgFName = "map_msgs.txt"; //+char *sMotFName = "motifs.txt"; char *sMatrixFile = "blos.wmt"; //+char *sMatrixFile = "rna.wmt"; char sMatrixName[130]; //vector< string > aMatrName; //aMatrName[0] = "blos.wmt"; char *matrName[] = { "blos.wmt", "other", "", "", "", "", "", "", "", "" }; char *matrDesc[10]; //{ "blos.wmt", "other" }; int numMatr = 0; char sOptLine[256]; char *cPos; // = strpbrk( sOptLine, "\n\r" ); Homology_info *hiResult0, *hiResult; //Weight_matrix *wm; //Weight_matrix_pos *wm2; FILE *seqFile; FILE *msgFile; FILE *motFile; char *msgFName, *motFName; //char *seq1FName = { "Seq1" }; //char *seq2FName = { "Seq2" }; char seq1FName[256]; // = "Seq1.txt"; char seq2FName[256]; // = "Seq2.txt"; char sPngNamePref[256]; // = "mot10_"; char sPngFullName[256]; // = "mot10_1"; int iLenLow = 7, iPowerLow=0, iNumFound = 0, iAccurate = 1; int iPower0 = 3; int iPower1 = 6; int iPdrop = 0; //int iMfr = 0; int freqRecalc = 0; int normMatr = 0; int molType = MQ_PROTEIN; // 0 - PROT, 1 - DNARNA, 2 - BOTH double dPower0 = 3.; double dPower1 = 6.; double dPdrop = 0.; double homolFactor = 0.01; int iMaxPrint = 10; int iMaxMsg = iMaxPrint; int iMaxPlot = 1000; #ifdef _TRACE for( i = 0; i < argc; i++ ) { cout << "parm = " << argv[i] << endl; } _GETCH; #endif // Get parm file name from command line (if any) & open ------------------- string parmFName( "parm.txt" ); // default = "parm.txt" if( argc > 1 ) parmFName = argv[1]; // get from command line cout << "* parmfile = " << parmFName << endl; //+ifstream ist( "parm.txt" ); ifstream ist( parmFName.c_str() ); //, ios::nocreate ); // s.c_str() or s.data() if( !ist ) { cout << "\nUnable to open parm file: " << parmFName << endl; return 1; } // ---------------------------------------------------------------- char s[130]; // buf char *ps = s; char p0[2] = { 13, 0 }; char *pend, *pw; // Get parms: while (true) { ist.getline( ps, 130); // SI ); if (ist.eof()) break; s[99] = 13; p0[0] = 13; pend = strpbrk( s, p0 ); if( pend > 0 ) { *pend = 0; } //cout << "Parm line: " << string(ps) << endl; if( strncmp( s, "Seq1 ", 5 ) == 0 ) { pw = &s[30]; strcpy(seq1FName, pw); #ifdef _TRACE cout << "* Seq1 File = " << seq1FName << endl; #endif } if( strncmp( s, "Seq2 ", 5 ) == 0 ) { pw = &s[30]; strcpy(seq2FName, pw); #ifdef _TRACE cout << "* Seq2 File = " << seq2FName << endl; #endif } if( strncmp( s, "PNG name pref ", 14 ) == 0 ) { pw = &s[30]; strcpy( sPngNamePref, pw); #ifdef _TRACE cout << "* PNG name pref = " << sPngNamePref << endl; #endif } // Noise Upper Bound if( strncmp( s, "Power ", 6 ) == 0 || strncmp( s, "NoiseUpperBound ", 16 ) == 0 ) { // not used by DotHelix pw = &s[30]; dPower0 = atof(pw) * 10.; iPower0 = (int)dPower0; #ifdef _TRACE cout << "* Power*10 = " << dPower0 << endl; //" = " << iPower0 < _POWER_MAX ) dPdrop = _POWER_MAX; iPowerLow = (int)(dPdrop * (double)Matrix_scale * (double)Norm_L); iPdrop = (int)(dPdrop * 10.); #ifdef _TRACE cout << "* PowerMostLow = " << dPdrop << endl; //" = " << iPdrop << endl; #endif } // Significant power threshold if( strncmp( s, "PowerS ", 7 ) == 0 || strncmp( s, "PowerSigni ", 11 ) == 0 ) { // not used by DotHelix //OUT_SPEC; pw = &s[30]; dPower1 = atof(pw) *10.; iPower1 = (int)dPower1; #ifdef _TRACE cout << "* PowerS*10 = " << (int)dPower1 << endl; #endif } if( strncmp( s, "Length ", 7 ) == 0 ) { //OUT_SPEC; pw = &s[30]; iLenLow = atoi(pw); #ifdef _TRACE cout << "* L = " << iLenLow << endl; #endif } if( strncmp( s, "Accurate ", 9 ) == 0 ) { pw = &s[30]; iAccurate = atoi(pw); #ifdef _TRACE cout << "* Accu = " << iAccurate << endl; #endif } if( strncmp( s, "Frequency ", 10 ) == 0 ) { pw = &s[30]; //iMfr = atoi(pw); freqRecalc = atoi(pw); #ifdef _TRACE cout << "* freqRecalc = " << freqRecalc << endl; #endif } if( strncmp( s, "Normalize ", 10 ) == 0 ) { pw = &s[30]; normMatr = atoi(pw); #ifdef _TRACE cout << "* normMatr = " << normMatr << endl; #endif } if( strncmp( s, "Homology ", 9 ) == 0 ) { pw = &s[30]; homolFactor = atof(pw); #ifdef _TRACE cout << "* Homo = " << homolFactor << endl; #endif } if( strncmp( s, "Print ", 6 ) == 0 ) { pw = &s[30]; iMaxPrint = atoi(pw); iMaxMsg = iMaxPrint; //+if( iMaxMsg < _SENS_MOTIFS ) iMaxMsg = _SENS_MOTIFS; #ifdef _TRACE cout << "* iMaxPrint = " << iMaxPrint << endl; cout << "* iMaxMsg = " << iMaxMsg << endl; #endif } if( strncmp( s, "Plot ", 5 ) == 0 ) { pw = &s[30]; iMaxPlot = atoi(pw); #ifdef _TRACE cout << "* iMaxPlot = " << iMaxPlot << endl; #endif } if( strncmp( s, "Moltype ", 8 ) == 0 ) { pw = &s[30]; molType = atoi(pw); #ifdef _TRACE cout << "* molType = " << molType << endl; #endif } if( strncmp( s, "Matrix ", 7 ) == 0 ) { pw = &s[30]; matrName[numMatr] = new char[strlen( pw ) + 1]; strcpy( matrName[numMatr], pw ); #ifdef _TRACE cout << "* Matrix = " << matrName[numMatr] << endl; #endif numMatr++; } if( strncmp( s, "Map msg file ", 13 ) == 0 ) { // map_msgs.txt pw = &s[30]; msgFName = new char[strlen( pw ) + 1]; strcpy( msgFName, pw ); #ifdef _TRACE cout << "* Image map msg file name = " << msgFName << endl; #endif } if( strncmp( s, "Motifs file ", 12 ) == 0 ) { // motifs.png pw = &s[30]; motFName = new char[strlen( pw ) + 1]; strcpy( motFName, pw ); #ifdef _TRACE cout << "* Motifs file name = " << motFName << endl; #endif } } //return 1; // Parm check: if( numMatr == 0 ) numMatr = 1; if( iMaxMsg < _SENS_MOTIFS ) iMaxMsg = _SENS_MOTIFS; if( iMaxPrint > _PRINTED_MAX ) iMaxPrint = _PRINTED_MAX ; if( iMaxPlot > _PLOTTED_MAX ) iMaxPlot = _PLOTTED_MAX ; #ifdef _TRACE cout << " - molType = " << molType << " (MQ_DNARNA=" < 0 ) *cPos = 0; matrDesc[nMat] = new char[strlen(sMatrixName)+1]; strcpy( matrDesc[nMat], sMatrixName ); fclose( fIn ); #ifdef _TRACE cout << "Matrix desc = " << sMatrixName << "***" << endl; //cout << " = " << matrDesc[nMat] << "***" << endl; #endif // --------------------------------- load & recalc FILE *mfile; if( ( mfile = fopen( matrName[ nMat ], "rb" ) ) != NULL && !aWm[ nMat ].fread( mfile ) ) { #ifdef _TRACE cout << " - *Recalc OK: " << matrName[nMat] << endl; #endif fclose( mfile ); } else { cout << "\n* Unable read the matrix: " << matrName[nMat] << endl;// Some diagnostic. Will be needed only in extraordinar cases... return 1; } } // OK - now all matrices are ready for usage... freq1, freq2 can be deleted ********* ??? // -------------------------------------------------------------------- #ifdef _TRACE // cout << "--- aElem with obj 'TwoSeqProg':" << endl; #endif int iPower, iX, iY; motif_opt mo; multimap< int, motif_opt > mm; // p => x.left, y.left, len multimap< int, motif_opt >::iterator imm; multimap< int, motif_opt >::reverse_iterator imr; // The last motif is the best // Open all files: //msgFile = fopen( sMsgFName, "w" ); //motFile = fopen( sMotFName, "w" ); msgFile = fopen( msgFName, "w" ); motFile = fopen( motFName, "w" ); //+iLen1 = iLen2 = 10; // *test below TwoSeqProgress ii(iLen1, iLen2); /** / // Test of TwoSeqProgress for( ii = iLenLow - 1; ii < (iLen1 + iLen2 - iLenLow); ++ii ) { //+for( ii = 0; ii < 14; ++ii ) { cout << (*ii).i << ". left1 = " << (*ii).left1 << ", left2 = " << (*ii).left2 << ", len = " << (*ii).len << endl; getch(); } return 1; /**/ #ifdef _TRACE // Find power (integer) bound: int _i, _i1, _i2; for( _i = 100; _i < 10000; _i += 100 ) { _i1 = _i * (Matrix_scale * Norm_L); _i2 = _i1 / (Matrix_scale * Norm_L); if( _i2 < _i ) { cout << "***** i-bound = " << _i << " -> " << _i2 << endl; break; } } #endif // ------------------------------------------------- Main Loop: for( nMat = 0; nMat < numMatr; nMat++ ) { /*NIK*/ freq1 = aWm[ nMat ].makeFreq( wSeq1.len(), wSeq1.seq() ); /*NIK*/ freq2 = aWm[ nMat ].makeFreq( wSeq2.len(), wSeq2.seq() ); /*NIK*/ if( normMatr ) /*NIK*/ aWm[ nMat ].normalize( freq1, freq2 ); /*NIK*/ aWm[ nMat ].recast( freq1, freq2 ); /*NIK*/ int iHomFac = aWm[ nMat ].rescale( homolFactor ); /*NIK*/ delete freq1; /*NIK*/ delete freq2; iQ = 0; #ifdef _TRACE cout << "*nMat = " << nMat << endl; #endif // Motif collection separator ******* if( nMat > 0 ) fprintf( motFile, "*next\n" ); //for( ii = iLenLow - 1; ii < (iLen1 + iLen2 - iLenLow - 1); ++ii ) { for( ii = iLenLow - 1; ii < (iLen1 + iLen2 - iLenLow); ++ii ) { #ifdef _TRACE //if( (*ii).left1 == 0 && (*ii).left2 == 0 ) { // cout << "***** (0,0) = " << (*ii).len << endl; //} #endif // Weightening //++ (*wm2).getWeight( wSeq1, (*ii).left1, wSeq2, (*ii).left2, (*ii).len, aElem, 0, 0 ); aWm[nMat].getWeight( wSeq1, (*ii).left1, wSeq2, (*ii).left2, (*ii).len, aElem, iHomFac, 0 ); hiResult = DotHelix( aElem, (*ii).len, iLenLow, iPowerLow, iNumFound, iAccurate ); hiResult0 = hiResult; for( i = 0; i < iNumFound; i++ ) { /*NIK*/ hiResult->sum += iHomFac*hiResult->length; /*NIK*/ hiResult->power = hiResult->sum * inv_sqrt( hiResult->length ); /*NIK*/ if( freqRecalc ) { double mult = 1.0; double add = 0.0; /*NIK*/ hiResult->sum = aWm[nMat].recalcSum( hiResult->length, /*NIK*/ wSeq1.seq() + hiResult->left + (*ii).left1, /*NIK*/ wSeq2.seq() + hiResult->left + (*ii).left2, /*NIK*/ hiResult->sum, 0, mult, add ); /*NIK*/ hiResult->power = hiResult->sum*inv_sqrt( hiResult->length ); /*NIK*/ } iQ++; iPower = (int)( ((*hiResult).power * 10.) / (double)(Matrix_scale * Norm_L) ); if( iPower < 0 ) { cout << " * (<0) iPower = " << iPower << ", " << (*hiResult).power << endl; //iPower = (iPowerLow * 10) / (Matrix_scale * Norm_L); } iX = (*hiResult).left + (*ii).left1; iY = (*hiResult).left + (*ii).left2; #ifdef _TRACE if( iX == iY ) { cout << " - (" << iX << "," << iY << ") = (L, p, P) = " << (*ii).len << ", " << iPower << ", " << (*hiResult).power << endl; } #endif /** cout << iQ << "." << (*ii).len << "." << i << ". Pos = " << (*hiResult).left << " Len = " << (*hiResult).length << " Power*10 = " << ((*hiResult).power * 10) / (Matrix_scale * Norm_L) << " X = " << (*hiResult).left + (*ii).left1 << " Y = " << (*hiResult).left + (*ii).left2 //<< " Sum = " << (*hiResult).sum << endl; **/ mo.x = iX; mo.y = iY; mo.len = (*hiResult).length; mm.insert( pair< const int, motif_opt >( (const int)iPower, mo ) ); hiResult++; } delete[] hiResult0; // *** #ifdef _TRACE if( iNumFound > 0 ) { //++cout << " - Found = " << iNumFound << endl; _GETCH; //getch(); } #endif } //cout << " Found = " << iQ << endl; #ifdef _TRACE cout << " - Motifs: " << iQ << endl; _GETCH; #endif iQQ += iQ; //char sOptLine[256]; f_fileBase( matrName[nMat], s, 29 ); // s[130] - buf f_toupper( s ); //sprintf( sOptLine, "Plot=%d,Len=%d,Accu=%d,Freq=%d,Norm=%d,Homo=%.3f, %s: %s\0", //iMaxPlot, iLenLow, iAccurate, freqRecalc, normMatr, homolFactor, s, matrDesc[nMat]); sprintf( sOptLine, "Plot=%d,Len=%d,Accu=%d,Freq=%d,Norm=%d,Homo=%.3f, %s\0" , iMaxPlot, iLenLow, iAccurate, freqRecalc, normMatr, homolFactor, s ); //if( strlen( sOptLine ) > 85 ) sOptLine[85] = 0; if( strlen( sOptLine ) > _OPTLINE_MAX ) sOptLine[_OPTLINE_MAX] = 0; #ifdef _TRACE cout << " - OptLine = " << sOptLine << endl; #endif // ----------------------------------------------------------- MAP sprintf( sPngFullName, "%s%d.png\0", sPngNamePref, nMat ); #ifdef _TRACE cout << " - PNG = " << sPngFullName << ", iPref = " << nMat << endl; //return 1; #endif fprintf( msgFile, "\n", nMat ); //iMsgPref ); if( iMaxPlot > 0 ) { // iW iH //MotifMap( mm, iPdrop,iPower0, iPower1, 640, 480 + 16 + 16 + 8, MotifMap( mm, iPdrop,iPower0, iPower1, _IMAGE_WIDTH, _IMAGE_HEIGHT, wSeq1, wSeq2, seq1Buf, seq2Buf, sOptLine, msgFile, iMaxPlot, iMaxMsg, sPngFullName, nMat ); } fprintf( msgFile, "\n" ); _GETCH; // ------------------------------------------------------------ Motifs' List //int molType = MQ_PROTEIN; //if( freqRecalc == 0 ) molType = MQ_DNARNA; iQ = 0; if( iMaxPrint < 1 ) { fprintf( motFile, "

Motifs output is not requested.

\n" ); } else { for( imr = mm.rbegin(); imr != mm.rend(); imr++ ) { iQ++; if( (*imr).first < iPower0 ) break; fprintf( motFile, " \n", nMat, iQ ); //iMsgPref, iQ ); //+MotifOutput( wSeq1, ((*imr).second).x, ">Seq1", wSeq2, ((*imr).second).y, ">Seq2", MotifOutput( wSeq1, ((*imr).second).x, seq1Buf, wSeq2, ((*imr).second).y, seq2Buf, ((*imr).second).len, (*imr).first, (Weight_matrix&) /*NIK*/ aWm[nMat], sMatrixName, iQ, nMat + 1, motFile ); // 1 //++(Weight_matrix&) aWm[0], molType, iQ, motFile ); // 1 //+(Weight_matrix&) aWm[0], 1, iQ, motFile ); // 1 #ifdef _TRACE // cout << "iQ = " << iQ << endl; #endif if( iQ >= iMaxPrint ) break; } // Not found: if( iQ < 1 ) { fprintf( motFile, "

Motifs Are Not Found. Try Smaller Thresholds.

\n" ); } } // Print requested //fclose( motFile ); #ifdef _TRACE cout << " - iPowerLow = " << iPowerLow << ", Factor = " << Matrix_scale * Norm_L << ", iPdrop*10 = " << iPdrop << endl; #endif mm.clear(); } // end of matr ----------------------------------------------- end of Main Loop // Close all files: fclose( msgFile ); fclose( motFile ); // Delete all: //delete[] matrName; //delete[] matrDesc; #ifdef _TRACE cout << "- Motifs' Total: " << iQQ << endl; #endif return 0; } // EOF