Skip to content
96 changes: 79 additions & 17 deletions mssql_python/pybind/ddbc_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,19 @@
#define DAE_CHUNK_SIZE 8192
#define SQL_MAX_LOB_SIZE 8000

// Determine the encoding actually used to decode SQL_C_CHAR column bytes.
// Unix ODBC drivers hand back UTF-8 regardless of the server collation
// (the conversion to UTF-8 already happened driver-side, e.g. from CP1252),
// so the caller-supplied encoding only matters on Windows, where bytes
// arrive in the server's native code page.
inline std::string GetEffectiveCharDecoding(const std::string& userEncoding) {
#if defined(__APPLE__) || defined(__linux__)
    static_cast<void>(userEncoding);  // intentionally unused on Unix
    return std::string{"utf-8"};
#else
    return userEncoding;
#endif
}

//-------------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------------------
// Logging Infrastructure:
Expand Down Expand Up @@ -1154,7 +1167,8 @@ void SqlHandle::markImplicitlyFreed() {
// Log error but don't throw - we're likely in cleanup/destructor path
LOG_ERROR("SAFETY VIOLATION: Attempted to mark non-STMT handle as implicitly freed. "
"Handle type=%d. This will cause handle leak. Only STMT handles are "
"automatically freed by parent DBC handles.", _type);
"automatically freed by parent DBC handles.",
_type);
return; // Refuse to mark - let normal free() handle it
}
_implicitly_freed = true;
Expand Down Expand Up @@ -2876,17 +2890,18 @@ py::object FetchLobColumnData(SQLHSTMT hStmt, SQLUSMALLINT colIndex, SQLSMALLINT
return py::bytes(buffer.data(), buffer.size());
}

// For SQL_C_CHAR data, decode using the specified encoding
// For SQL_C_CHAR data, decode using the appropriate encoding.
const std::string effectiveCharEncoding = GetEffectiveCharDecoding(charEncoding);
py::bytes raw_bytes(buffer.data(), buffer.size());
try {
py::object decoded = raw_bytes.attr("decode")(charEncoding, "strict");
py::object decoded = raw_bytes.attr("decode")(effectiveCharEncoding, "strict");
LOG("FetchLobColumnData: Decoded narrow string with '%s' - %zu bytes -> %zu chars for "
"column %d",
charEncoding.c_str(), buffer.size(), py::len(decoded), colIndex);
effectiveCharEncoding.c_str(), buffer.size(), py::len(decoded), colIndex);
return decoded;
} catch (const py::error_already_set& e) {
LOG_ERROR("FetchLobColumnData: Failed to decode with '%s' for column %d: %s",
charEncoding.c_str(), colIndex, e.what());
effectiveCharEncoding.c_str(), colIndex, e.what());
// Return raw bytes as fallback
return raw_bytes;
}
Expand Down Expand Up @@ -2942,7 +2957,23 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
row.append(
FetchLobColumnData(hStmt, i, SQL_C_CHAR, false, false, charEncoding));
} else {
uint64_t fetchBufferSize = columnSize + 1 /* null-termination */;
// Allocate columnSize * 4 + 1 on ALL platforms (no #if guard).
//
// Why this differs from SQLBindColums / FetchBatchData:
// Those two functions use #if to apply *4 only on Linux/macOS,
// because on Windows with a non-UTF-8 collation (e.g. CP1252)
// each character occupies exactly 1 byte, so *1 suffices and
// saves memory across the entire batch (fetchSize × numCols
// buffers).
//
// SQLGetData_wrap allocates a single temporary buffer per
// column per row, so the over-allocation cost is negligible.
// Using *4 unconditionally here keeps the code simple and
// correct on every platform—including Windows with a UTF-8
// collation where multi-byte chars could otherwise cause
// truncation at the exact column boundary (e.g. CP1252 é in
// VARCHAR(10)).
uint64_t fetchBufferSize = columnSize * 4 + 1 /* null-termination */;
std::vector<SQLCHAR> dataBuffer(fetchBufferSize);
SQLLEN dataLen;
ret = SQLGetData_ptr(hStmt, i, SQL_C_CHAR, dataBuffer.data(), dataBuffer.size(),
Expand All @@ -2953,20 +2984,23 @@ SQLRETURN SQLGetData_wrap(SqlHandlePtr StatementHandle, SQLUSMALLINT colCount, p
uint64_t numCharsInData = dataLen / sizeof(SQLCHAR);
if (numCharsInData < dataBuffer.size()) {
// SQLGetData will null-terminate the data
// Use Python's codec system to decode bytes with specified encoding
// Use Python's codec system to decode bytes.
const std::string decodeEncoding =
GetEffectiveCharDecoding(charEncoding);
py::bytes raw_bytes(reinterpret_cast<char*>(dataBuffer.data()),
static_cast<size_t>(dataLen));
try {
py::object decoded =
raw_bytes.attr("decode")(charEncoding, "strict");
raw_bytes.attr("decode")(decodeEncoding, "strict");
row.append(decoded);
LOG("SQLGetData: CHAR column %d decoded with '%s', %zu bytes "
"-> %zu chars",
i, charEncoding.c_str(), (size_t)dataLen, py::len(decoded));
i, decodeEncoding.c_str(), (size_t)dataLen,
py::len(decoded));
} catch (const py::error_already_set& e) {
LOG_ERROR(
"SQLGetData: Failed to decode CHAR column %d with '%s': %s",
i, charEncoding.c_str(), e.what());
i, decodeEncoding.c_str(), e.what());
// Return raw bytes as fallback
row.append(raw_bytes);
}
Expand Down Expand Up @@ -3453,7 +3487,14 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column
// TODO: handle variable length data correctly. This logic wont
// suffice
HandleZeroColumnSizeAtFetch(columnSize);
// Use columnSize * 4 + 1 on Linux/macOS to accommodate UTF-8
// expansion. The ODBC driver returns UTF-8 for SQL_C_CHAR where
// each character can be up to 4 bytes.
#if defined(__APPLE__) || defined(__linux__)
uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/;
#else
uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
#endif
// TODO: For LONGVARCHAR/BINARY types, columnSize is returned as
// 2GB-1 by SQLDescribeCol. So fetchBufferSize = 2GB.
// fetchSize=1 if columnSize>1GB. So we'll allocate a vector of
Expand Down Expand Up @@ -3601,7 +3642,8 @@ SQLRETURN SQLBindColums(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& column
// TODO: Move to anonymous namespace, since it is not used outside this file
SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& columnNames,
py::list& rows, SQLUSMALLINT numCols, SQLULEN& numRowsFetched,
const std::vector<SQLUSMALLINT>& lobColumns) {
const std::vector<SQLUSMALLINT>& lobColumns,
const std::string& charEncoding = "utf-8") {
LOG("FetchBatchData: Fetching data in batches");
SQLRETURN ret = SQLFetchScroll_ptr(hStmt, SQL_FETCH_NEXT, 0);
if (ret == SQL_NO_DATA) {
Expand Down Expand Up @@ -3631,8 +3673,22 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
std::find(lobColumns.begin(), lobColumns.end(), col + 1) != lobColumns.end();
columnInfos[col].processedColumnSize = columnInfos[col].columnSize;
HandleZeroColumnSizeAtFetch(columnInfos[col].processedColumnSize);
// On Linux/macOS, the ODBC driver returns UTF-8 for SQL_C_CHAR where
// each character can be up to 4 bytes. Must match SQLBindColums buffer.
#if defined(__APPLE__) || defined(__linux__)
SQLSMALLINT dt = columnInfos[col].dataType;
bool isCharType = (dt == SQL_CHAR || dt == SQL_VARCHAR || dt == SQL_LONGVARCHAR);
if (isCharType) {
columnInfos[col].fetchBufferSize = columnInfos[col].processedColumnSize * 4 +
1; // *4 for UTF-8, +1 for null terminator
} else {
columnInfos[col].fetchBufferSize =
columnInfos[col].processedColumnSize + 1; // +1 for null terminator
}
#else
columnInfos[col].fetchBufferSize =
columnInfos[col].processedColumnSize + 1; // +1 for null terminator
#endif
}

// Performance: Build function pointer dispatch table (once per batch)
Expand All @@ -3642,13 +3698,18 @@ SQLRETURN FetchBatchData(SQLHSTMT hStmt, ColumnBuffers& buffers, py::list& colum
std::vector<ColumnProcessor> columnProcessors(numCols);
std::vector<ColumnInfoExt> columnInfosExt(numCols);

// Compute effective char encoding once for the batch (same for all columns)
const std::string effectiveCharEnc = GetEffectiveCharDecoding(charEncoding);

for (SQLUSMALLINT col = 0; col < numCols; col++) {
// Populate extended column info for processors that need it
columnInfosExt[col].dataType = columnInfos[col].dataType;
columnInfosExt[col].columnSize = columnInfos[col].columnSize;
columnInfosExt[col].processedColumnSize = columnInfos[col].processedColumnSize;
columnInfosExt[col].fetchBufferSize = columnInfos[col].fetchBufferSize;
columnInfosExt[col].isLob = columnInfos[col].isLob;
columnInfosExt[col].charEncoding = effectiveCharEnc;
columnInfosExt[col].isUtf8 = (effectiveCharEnc == "utf-8");

// Map data type to processor function (switch executed once per column,
// not per cell)
Expand Down Expand Up @@ -4094,7 +4155,8 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)(intptr_t)fetchSize, 0);
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, &numRowsFetched, 0);

ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns);
ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns,
charEncoding);
if (!SQL_SUCCEEDED(ret) && ret != SQL_NO_DATA) {
LOG("FetchMany_wrap: Error when fetching data - SQLRETURN=%d", ret);
return ret;
Expand All @@ -4103,10 +4165,10 @@ SQLRETURN FetchMany_wrap(SqlHandlePtr StatementHandle, py::list& rows, int fetch
// Reset attributes before returning to avoid using stack pointers later
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)1, 0);
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, NULL, 0);

// Unbind columns to allow subsequent fetchone() calls to use SQLGetData
SQLFreeStmt_ptr(hStmt, SQL_UNBIND);

return ret;
}

Expand Down Expand Up @@ -4231,8 +4293,8 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows,
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, &numRowsFetched, 0);

while (ret != SQL_NO_DATA) {
ret =
FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns);
ret = FetchBatchData(hStmt, buffers, columnNames, rows, numCols, numRowsFetched, lobColumns,
charEncoding);
if (!SQL_SUCCEEDED(ret) && ret != SQL_NO_DATA) {
LOG("FetchAll_wrap: Error when fetching data - SQLRETURN=%d", ret);
return ret;
Expand All @@ -4242,7 +4304,7 @@ SQLRETURN FetchAll_wrap(SqlHandlePtr StatementHandle, py::list& rows,
// Reset attributes before returning to avoid using stack pointers later
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROW_ARRAY_SIZE, (SQLPOINTER)1, 0);
SQLSetStmtAttr_ptr(hStmt, SQL_ATTR_ROWS_FETCHED_PTR, NULL, 0);

// Unbind columns to allow subsequent fetchone() calls to use SQLGetData
SQLFreeStmt_ptr(hStmt, SQL_UNBIND);

Expand Down
47 changes: 38 additions & 9 deletions mssql_python/pybind/ddbc_bindings.h
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,8 @@ struct ColumnInfoExt {
SQLULEN processedColumnSize;
uint64_t fetchBufferSize;
bool isLob;
bool isUtf8; // Pre-computed from charEncoding (avoids string compare per cell)
std::string charEncoding; // Effective decoding encoding for SQL_C_CHAR data
};

// Forward declare FetchLobColumnData (defined in ddbc_bindings.cpp) - MUST be
Expand Down Expand Up @@ -811,21 +813,48 @@ inline void ProcessChar(PyObject* row, ColumnBuffers& buffers, const void* colIn
// fetchBufferSize includes null-terminator, numCharsInData doesn't. Hence
// '<'
if (!colInfo->isLob && numCharsInData < colInfo->fetchBufferSize) {
// Performance: Direct Python C API call - create string from buffer
PyObject* pyStr = PyUnicode_FromStringAndSize(
reinterpret_cast<char*>(
&buffers.charBuffers[col - 1][rowIdx * colInfo->fetchBufferSize]),
numCharsInData);
const char* dataPtr = reinterpret_cast<char*>(
&buffers.charBuffers[col - 1][rowIdx * colInfo->fetchBufferSize]);
PyObject* pyStr = nullptr;
#if defined(__APPLE__) || defined(__linux__)
// On Linux/macOS, ODBC driver returns UTF-8 — PyUnicode_FromStringAndSize
// expects UTF-8, so this is correct and fast.
pyStr = PyUnicode_FromStringAndSize(dataPtr, numCharsInData);
#else
// On Windows, ODBC driver returns bytes in the server's native encoding.
// For UTF-8, use the direct C API (PyUnicode_FromStringAndSize) which
// bypasses the codec registry for maximum reliability. For non-UTF-8
// encodings (e.g., CP1252), use PyUnicode_Decode with the codec registry.
if (colInfo->isUtf8) {
pyStr = PyUnicode_FromStringAndSize(dataPtr, numCharsInData);
} else {
pyStr =
PyUnicode_Decode(dataPtr, numCharsInData, colInfo->charEncoding.c_str(), "strict");
}
#endif
if (!pyStr) {
Py_INCREF(Py_None);
PyList_SET_ITEM(row, col - 1, Py_None);
// Decode failed — fall back to returning raw bytes (consistent with
// FetchLobColumnData and SQLGetData_wrap which also return raw bytes
// on decode failure instead of silently converting to None).
PyErr_Clear();
PyObject* pyBytes = PyBytes_FromStringAndSize(dataPtr, numCharsInData);
if (pyBytes) {
PyList_SET_ITEM(row, col - 1, pyBytes);
} else {
PyErr_Clear();
Py_INCREF(Py_None);
PyList_SET_ITEM(row, col - 1, Py_None);
}
} else {
PyList_SET_ITEM(row, col - 1, pyStr);
}
} else {
// Slow path: LOB data requires separate fetch call
PyList_SET_ITEM(row, col - 1,
FetchLobColumnData(hStmt, col, SQL_C_CHAR, false, false).release().ptr());
PyList_SET_ITEM(
row, col - 1,
FetchLobColumnData(hStmt, col, SQL_C_CHAR, false, false, colInfo->charEncoding)
.release()
.ptr());
}
}

Expand Down
1 change: 1 addition & 0 deletions tests/test_013_encoding_decoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -5697,6 +5697,7 @@ def test_default_encoding_behavior_validation(conn_str):
def test_encoding_with_bytes_and_bytearray_parameters(db_connection):
"""Test encoding with bytes and bytearray parameters (SQL_C_CHAR path)."""
db_connection.setencoding(encoding="utf-8", ctype=mssql_python.SQL_CHAR)
db_connection.setdecoding(mssql_python.SQL_CHAR, encoding="utf-8")

cursor = db_connection.cursor()
try:
Expand Down
Loading
Loading