#ifdef WIN32 // Visual C++ applies the wrong scope to for loop counters # define for if (1) for // Visual C++ blathers about long debug symbols # pragma warning(disable: 4786) #endif #include "Python.h" #include #include #include typedef std::pair Pair; typedef std::map Similarity; const char *BadKey = "dictionary key must be tuple of two characters"; const char *MissingKey = "no score for '%c' vs. '%c'"; // // makeMatrix // Convert a Python similarity dictionary into a C++ similarity map // // The keys in the Python dictionary must be 2-tuples of // single character strings, each representing a base type. // This routine does not automatically generate the inverse // pair (i.e., does not generate (b,a) when given (a,b) so // the caller must provide the full matrix instead of just // the upper or lower half. The values in the dictionary // must be floating point numbers. // static int makeMatrix(PyObject *dict, Similarity &matrix) { if (!PyDict_Check(dict)) { PyErr_SetString(PyExc_TypeError, "matrix must be a dictionary"); return -1; } PyObject *key, *value; int pos = 0; while (PyDict_Next(dict, &pos, &key, &value)) { // // Verify key type // if (!PyTuple_Check(key) || PyTuple_Size(key) != 2) { PyErr_SetString(PyExc_TypeError, BadKey); return -1; } PyObject *pk0 = PyTuple_GetItem(key, 0); if (!PyString_Check(pk0)) { PyErr_SetString(PyExc_TypeError, BadKey); return -1; } char *k0 = PyString_AsString(pk0); if (strlen(k0) != 1) { PyErr_SetString(PyExc_TypeError, BadKey); return -1; } PyObject *pk1 = PyTuple_GetItem(key, 1); if (!PyString_Check(pk1)) { PyErr_SetString(PyExc_TypeError, BadKey); return -1; } char *k1 = PyString_AsString(pk1); if (strlen(k1) != 1) { PyErr_SetString(PyExc_TypeError, BadKey); return -1; } // // Verify value type // if (!PyFloat_Check(value)) { PyErr_SetString(PyExc_TypeError, "dictionary value must be float"); return -1; } double v = PyFloat_AsDouble(value); // // Store in C++ map // matrix[Pair(*k0, *k1)] = v; } return 0; } // // score // Compute the score of the best Smith-Waterman alignment // // The Python function takes five arguments: // seq1 first sequence // seq2 second sequence // matrix similarity dictionary (see above) // gapOpen gap opening penalty // gapExtend gap extension penalty // and returns the best score. This function is mostly // useful for optimizing similarity matrices. // extern "C" { static PyObject * score(PyObject *self, PyObject *args) { char *seq1, *seq2; PyObject *m; double gapOpen, gapExtend; if (!PyArg_ParseTuple(args, "ssOdd", &seq1, &seq2, &m, &gapOpen, &gapExtend)) return NULL; Similarity matrix; if (makeMatrix(m, matrix) < 0) return NULL; int rows = strlen(seq1) + 1; int cols = strlen(seq2) + 1; double **H = new double *[rows]; for (int i = 0; i < rows; ++i) { H[i] = new double[cols]; for (int j = 0; j < cols; ++j) H[i][j] = 0; } double bestScore = 0; for (int i = 1; i < rows; ++i) { for (int j = 1; j < cols; ++j) { Similarity::iterator it = matrix.find(Pair(seq1[i - 1], seq2[j - 1])); if (it == matrix.end()) { char buf[80]; (void) sprintf(buf, MissingKey, seq1[i - 1], seq2[j - 1]); PyErr_SetString(PyExc_KeyError, buf); return NULL; } double best = H[i - 1][j - 1] + (*it).second; for (int k = 1; k < i; ++k) { double score = H[i - k][j] - gapOpen - k * gapExtend; if (score > best) best = score; } for (int l = 1; l < j; ++l) { double score = H[i][j - l] - gapOpen - l * gapExtend; if (score > best) best = score; } if (best < 0) best = 0; H[i][j] = best; if (best > bestScore) bestScore = best; } } for (int i = 0; i < rows; ++i) delete [] H[i]; delete H; return PyFloat_FromDouble(bestScore); } // // align // Compute the best Smith-Waterman score and alignment // // The Python function takes five arguments: // seq1 first sequence // seq2 second sequence // matrix similarity dictionary (see above) // gapOpen gap opening penalty // gapExtend gap extension penalty // and returns the best score and the alignment. // The alignment is represented by a 2-tuple of strings, // where the first and second elements of the tuple // represent bases (and gaps) from seq1 and seq2 respectively. // static PyObject * align(PyObject *self, PyObject *args) { char *seq1, *seq2; PyObject *m; double gapOpen, gapExtend; if (!PyArg_ParseTuple(args, "ssOdd", &seq1, &seq2, &m, &gapOpen, &gapExtend)) return NULL; // // Convert Python similarity dictionary into C++ similarity map // Similarity matrix; if (makeMatrix(m, matrix) < 0) return NULL; int rows = strlen(seq1) + 1; int cols = strlen(seq2) + 1; // // Allocate space for the score matrix and the backtracking matrix // double **H = new double *[rows]; int **bt = new int *[rows]; for (int i = 0; i < rows; ++i) { H[i] = new double[cols]; bt[i] = new int[cols]; for (int j = 0; j < cols; ++j) { H[i][j] = 0; bt[i][j] = 0; } } // // Fill in all cells of the score matrix // double bestScore = 0; int bestRow = 0, bestColumn = 0; for (int i = 1; i < rows; ++i) { for (int j = 1; j < cols; ++j) { // // Start with the matching score // Similarity::iterator it = matrix.find(Pair(seq1[i - 1], seq2[j - 1])); if (it == matrix.end()) { char buf[80]; (void) sprintf(buf, MissingKey, seq1[i - 1], seq2[j - 1]); PyErr_SetString(PyExc_KeyError, buf); return NULL; } double best = H[i - 1][j - 1] + (*it).second; int op = 0; // // Check if insertion is better // for (int k = 1; k < i; ++k) { double score = H[i - k][j] - gapOpen - k * gapExtend; if (score > best) { best = score; op = k; } } // // Check if deletion is better // for (int l = 1; l < j; ++l) { double score = H[i][j - l] - gapOpen - l * gapExtend; if (score > best) { best = score; op = -l; } } // // Check if this is just a bad place // to start/end an alignment // if (best < 0) { best = 0; op = 0; } // // Save the best score in the score Matrix // H[i][j] = best; bt[i][j] = op; if (best > bestScore) { bestScore = best; bestRow = i; bestColumn = j; } } } // // Use the backtrack matrix to create the best alignment // std::string a1, a2; while (H[bestRow][bestColumn] > 0) { int op = bt[bestRow][bestColumn]; if (op > 0) { for (int k = 0; k < op; ++k) { --bestRow; a1.append(1, seq1[bestRow]); a2.append(1, '-'); } } else if (op == 0) { --bestRow; --bestColumn; a1.append(1, seq1[bestRow]); a2.append(1, seq2[bestColumn]); } else { op = -op; for (int k = 0; k < op; ++k) { --bestColumn; a1.append(1, '-'); a2.append(1, seq2[bestColumn]); } } } std::reverse(a1.begin(), a1.end()); std::reverse(a2.begin(), a2.end()); PyObject *alignment = PyTuple_New(2); PyTuple_SetItem(alignment, 0, PyString_FromString(a1.c_str())); PyTuple_SetItem(alignment, 1, PyString_FromString(a2.c_str())); // // Release the score and backtrack matrix memory // for (int i = 0; i < rows; ++i) { delete [] H[i]; delete [] bt[i]; } delete H; delete bt; // // Return our results // return Py_BuildValue("fO", bestScore, alignment); } } static PyMethodDef sw_methods[] = { // "sw" is around for historical reasons { "sw", score, 1, "Smith-Waterman scoring" }, { "score", score, 1, "Smith-Waterman scoring" }, { "align", align, 1, "Smith-Waterman alignment" }, { NULL, NULL } }; extern "C" void initsw() { Py_InitModule("sw", sw_methods); }