服务器 频道

DB2中如何实现正则表达式

  清单 9. 实现表函数将要使用的 regex3 函数

#include <pcre.h>
#include <sqludf.h>
#include <sqlstate.h>
#include <string.h>
struct scratchPadMapping {
    pcre *re;
    pcre_extra *extra;
    const char *error;
    int errOffset;
    int numSubstr;
    int *substr;
    int currentSubstr;
};
void regexpSubstr(
    // input parameters
    SQLUDF_VARCHAR *pattern,      SQLUDF_CLOB *str,
    // output
    SQLUDF_INTEGER *pos,          SQLUDF_VARCHAR *substr,
    // null indicators
    SQLUDF_NULLIND *pattern_ind,  SQLUDF_NULLIND *str_ind,
    SQLUDF_NULLIND *pos_ind,      SQLUDF_NULLIND *substr_ind,
    SQLUDF_TRAIL_ARGS_ALL) // SQLUDF_SCRAT & SQLUDF_CALLT
{
    int rc = 0;
    size_t length = 0;
    struct scratchPadMapping *scratch = NULL;
    // map the buffer of the scratchpad and assume NULL return
    scratch = (struct scratchPadMapping *)SQLUDF_SCRAT->data;
    *pos_ind = 0;
    *substr_ind = 0;
    switch (SQLUDF_CALLT) {
      case SQLUDF_TF_OPEN:
        // initialize data on the scratchpad
        scratch->re = NULL;
        scratch->extra = NULL;
        scratch->error = NULL;
        scratch->errOffset = 0;
        scratch->numSubstr = 0;
        scratch->substr = NULL;
        scratch->currentSubstr = 1; // skip the complete match
        // compile the pattern (only in the FIRST call
        scratch->re = pcre_compile(pattern, 0 /* default options */,
            &scratch->error, &scratch->errOffset, NULL);
        if (scratch->re == NULL) {
            snprintf(SQLUDF_MSGTX, 70, "Regexp compilation failed at "
                "offset %d: %s\\n", scratch->errOffset, scratch->error);
            strcpy(SQLUDF_STATE, "38900");
            rc = -1;
            break;
        }
        // further analyze the pattern (might return NULL)
        scratch->extra = pcre_study(scratch->re,
            0 /* default options */, &scratch->error);
        // determine the number of capturing subpatterns
        rc = pcre_fullinfo(scratch->re, scratch->extra,
            PCRE_INFO_CAPTURECOUNT, &scratch->numSubstr);
        if (rc) {
            snprintf(SQLUDF_MSGTX, 70, "Could not retrieve info "
                "on pattern. (rc = %d)", rc);
            strcpy(SQLUDF_STATE, "38901");
            rc = -1;
            break;
        }
        // allocate memory for the substring indices
        {
            int size = (scratch->numSubstr+1)*3;
            scratch->substr = (int *)malloc(size * sizeof(int));
            if (!scratch->substr) {
                snprintf(SQLUDF_MSGTX, 70, "Could allocate memory for "
                    "substring indices.");
                strcpy(SQLUDF_STATE, "38902");
                rc = -1;
                break;
            }
            memset(scratch->substr, 0, size * sizeof(int));
            // match the current string
            rc = pcre_exec(scratch->re, scratch->extra, str->data,
            str->length, 0, 0 /* default options */,
                scratch->substr, size);
        }
        switch (rc) {
          case PCRE_ERROR_BADOPTION:
            snprintf(SQLUDF_MSGTX, 70, "An unrecognized bit was set "
                "in the options argument");
            strcpy(SQLUDF_STATE, "38903");
            rc = -1;
            break;
          case PCRE_ERROR_NOMEMORY:
            snprintf(SQLUDF_MSGTX, 70, "Not enough memory available.");
            strcpy(SQLUDF_STATE, "38904");
            rc = -1;
            break;
          case PCRE_ERROR_NOMATCH:
            scratch->currentSubstr = scratch->numSubstr + 1;
            rc = 0;
            break;
          default:
            if (rc < 0) {
                snprintf(SQLUDF_MSGTX, 70, "A regexp match error "
                    "occured: %d", rc);
                strcpy(SQLUDF_STATE, "38905");
                rc = -1;
                break;
            }
        }
        break;
      case SQLUDF_TF_FETCH:
        // skip capturing substrings without a match
        while (scratch->currentSubstr <= scratch->numSubstr &&
            (scratch->substr[2*scratch->currentSubstr] < 0 ||
                scratch->substr[2*scratch->currentSubstr+1] < 0)) {
            scratch->currentSubstr++;
        }
        // no more data to be returned
        if (scratch->currentSubstr > scratch->numSubstr) {
            strcpy(SQLUDF_STATE, SQL_NODATA_EXCEPTION);
            rc = 0;
            break;
        }
        // get the current substring
        *pos = scratch->currentSubstr;
        length = scratch->substr[2*scratch->currentSubstr+1] -
            scratch->substr[2*scratch->currentSubstr];
        strncpy(substr, str->data + scratch->substr[2*scratch->currentSubstr],
            length);
        substr[length] = ''\\0'';
        scratch->currentSubstr++;
    }
    // cleanup in CLOSE call, or if we encountered an error in
    // the OPEN call (DB2 will make a CLOSE call if we encounter
    // an error in any FETCH call)
    if (SQLUDF_CALLT == SQLUDF_TF_CLOSE ||
        (SQLUDF_CALLT == SQLUDF_TF_OPEN && rc < 0)) {
        (*pcre_free)(scratch->re);
        (*pcre_free)(scratch->extra);
        free(scratch->substr);
    }
    return;
}

  正如我们对基本匹配函数所做的那样,您也可使用 FINAL CALL 定义表函数来改进性能。必须修改 C 代码以处理 SQLUDF_TF_FIRST 和 SQLUDF_TF_FINAL 调用。

  用法示例

  表函数可以在类似于如下所示的 SELECT 语句中使用:


SELECT c1, str, num, substr
FROM   strTable,
       TABLE ( regex3(''(\\w+)\\s*=\\s*(\\d+|''''[\\w\\s]*'''');'', str) ) AS sub(num, substr)

  结果只包括拥有匹配模式的字符串。对于每个字符串,都在单独的行中显示所捕获的第一个和第二个子串。


C1          2                              NUM         4
----------- ------------------------------ ----------- -----------------------
          2 variable = 1234;                         1 variable
          2 variable = 1234;                         2 1234
          3 var2 = ''string variable'';                1 var2
          3 var2 = ''string variable'';                2 ''string variable''
  4 record(s) selected.

  下一个查询在结果集中用单独的列而不是单独的行返回这两个子串对。这样,用 SQL 语句进一步处理这些字符串以及它们的子串要容易些。该查询使用了公共表表达式(由关键字 WITH 表示)来确保在整个查询中只对每个字符串进行一次求值,而不是在中间表 s1s2 所需的每次子选择中都进行一次求值。


WITH substrings(c, num, substr) AS
   ( SELECT c1, num, substr
     FROM   strTable,
            TABLE ( regex3(''(\\w+)\\s*=\\s*(\\d+|''''[\\w\\s]*'''');'', str) )
               AS sub(num, substr) )
SELECT t.c1, s1.substr AS variable, s2.substr AS value
FROM   strTable AS t JOIN substrings AS s1 ON
          ( t.c1 = s1.c ) JOIN
       substrings AS s2 ON
          ( t.c1 = s2.c )
WHERE  s1.num = 1 AND s2.num = 2

  这里使用了与前面查询中相同的模式。因此,可以从上面的表中派生出结果,但这一次,根据请求,这些 variable-value 对每个都位于单独的行中。


C1          VARIABLE                       VALUE
----------- ------------------------------ --------------------
          2 variable                       1234
          3 var2                           ''string variable''
  2 record(s) selected. 

  结束语

  在本文中,我相当简略地介绍了正则表达式以及 DB2 中可用的字符串比较和匹配功能。我还描述了为什么正则表达式的强大功能是如此有用。UDF 可以用来以两种形式在 DB2 中提供正则表达式。在第一种形式中,通过将字符串与给定模式进行比较来完成基本匹配。第二种形式是实现表函数,它从正则表达式抽取已捕获的子串,并将这些子串返回给 DB2,以便用在 SQL 语句的进一步处理中。我还给出了一些关于如何改进性能的提示。

0
相关文章