How to do regex string replacements in pure C?

The PCRE library itself does not provide a replace function, but there is a wrapper function available at the PCRE downloads page that accepts perl style =~ s/pattern/replace/ syntax and then uses the PCRE native functions to do a substitute/replace for you. Go to http://www.pcre.org/ then click on the Download link: ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/, then the Contrib directory. The package/project you want is: pcrs-0.0.3-src.tar.gz.

Note that I have not used this myself so I cannot testify as to how well it works. It is a fairly small and simple piece of code however, so it may well serve your purpose nicely.


regex.h does not provide native support for string replacement, however it does provide subexpressions/capture groups which make it much easier. I'll assume that you're familiar with regex compilations and skip to regex execution and subexpressions.

regexec() is defined as follows in regex.h (/usr/include/):

extern int regexec (const regex_t *__restrict __preg,
        const char *__restrict __string, size_t __nmatch,
        regmatch_t __pmatch[__restrict_arr],
        int __eflags);

The first, second, and final arguments are the compiled regex, the string to be searched, and the execution flags, respectively. The third and fourth arguments specify an array of regmatch_t's and its length. A regmatch_t consists of two fields, rm_so and rm_eo, which are the byte offsets of the start of the matched area and of the first character after it, respectively (element 0 describes the whole match, element i the i-th subexpression). These offsets can then be used along with memcpy(), memset() and memmove() from string.h to perform string replacement.

I'll make a little example and post it later.

Good luck, and I hope that this helped.


I've taken the post by @marnout and fixed it up, addressing a number of bugs and typos. Fixes: memory leaks; infinite replacement when the replacement contains the pattern; printing inside the function replaced with return values; back-reference values can actually go up to 31; documentation; and more test examples.

/* regex_replace.c
:w | !gcc % -o .%<
:w | !gcc % -o .%< && ./.%<
:w | !gcc % -o .%< && valgrind -v ./.%<
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <regex.h>

/* Appends the first `n` bytes of `src` to the growable, NUL-terminated buffer
 * described by (*buf, *len, *cap), doubling the capacity when it runs out.
 * Keeps the original out-of-memory policy: the process exits on failure. */
static void rr_append(char **buf, size_t *len, size_t *cap, const char *src, size_t n) {
    if (n >= (size_t)-1 - *len) exit(EXIT_FAILURE); // *len + n + 1 would wrap around
    size_t needed = *len + n + 1;                   // +1 for the terminator
    if (needed > *cap) {
        size_t grown = (*cap > (size_t)-1 / 2) ? needed : *cap * 2;
        if (grown < needed) grown = needed;
        char *tmp = realloc(*buf, grown);
        if (!tmp) exit(EXIT_FAILURE);
        *buf = tmp;
        *cap = grown;
    }
    if (n) memcpy(*buf + *len, src, n);
    *len += n;
    (*buf)[*len] = '\0';
}

/* Replaces every match of the POSIX extended regex `pattern` in *str with
 * `replace`, expanding back references to capture groups.
 *
 * str      in/out; *str MUST be free-able (obtained by malloc, strdup, ...).
 *          When at least one match is replaced the old buffer is freed and
 *          *str is set to a newly allocated string. On a negative return or
 *          when nothing matches, *str is left untouched.
 * pattern  POSIX extended regular expression.
 * replace  replacement text. A byte with value 1-31 is a back reference to
 *          the capture group of that number, so control characters such as
 *          a tab cannot appear literally. Bytes >= 0x80 (e.g. UTF-8) are
 *          ordinary text.
 *
 * Replaced text is never rescanned: the search resumes right after the end
 * of the previous match. `^` can match only at the very start of *str. After
 * a zero-length match one input character is copied through unchanged so
 * that the scan always makes progress.
 *
 * returns:
 *   -1 if pattern cannot be compiled
 *   -2 if the number of back references differs from the number of capture
 *      groups, or a back reference names a group that does not exist
 *   otherwise the number of matches that were found and replaced
 *
 * Exits the process with EXIT_FAILURE if memory cannot be allocated.
 */
int regex_replace(char **str, const char *pattern, const char *replace) {
    regex_t reg;
    if (regcomp(&reg, pattern, REG_EXTENDED) != 0) {
        return -1;
    }
    const size_t ngroups = reg.re_nsub;

    // Validate the back references up front. Going through unsigned char
    // keeps bytes >= 0x80 from looking like (negative) group numbers.
    size_t refs = 0;
    for (const unsigned char *r = (const unsigned char *)replace; *r != '\0'; r++) {
        if (*r < 32) {
            if (*r > ngroups) { // refers to a group the pattern does not have
                regfree(&reg);
                return -2;
            }
            refs++;
        }
    }
    if (refs != ngroups) {
        regfree(&reg);
        return -2;
    }

    regmatch_t *m = malloc((ngroups + 1) * sizeof *m);
    if (!m) exit(EXIT_FAILURE);

    char *out = NULL;           // result, built in a single left-to-right pass
    size_t out_len = 0;
    size_t out_cap = 0;
    const char *cursor = *str;  // first input byte not yet consumed
    int replacements = 0;

    // Only the very first search starts at a real beginning of line; later
    // searches start mid-string, so `^` must not match there.
    while (regexec(&reg, cursor, ngroups + 1, m, cursor == *str ? 0 : REG_NOTBOL) == 0) {
        // text before the match
        rr_append(&out, &out_len, &out_cap, cursor, (size_t)m[0].rm_so);

        // replacement: literal runs interleaved with captured groups
        const char *lit = replace;
        for (const char *r = replace; ; r++) {
            unsigned char ch = (unsigned char)*r;
            if (ch >= 32) continue; // ordinary character, stays in the run
            rr_append(&out, &out_len, &out_cap, lit, (size_t)(r - lit));
            if (ch == '\0') break;
            if (m[ch].rm_so >= 0) { // a group that did not participate expands to nothing
                rr_append(&out, &out_len, &out_cap, cursor + m[ch].rm_so,
                          (size_t)(m[ch].rm_eo - m[ch].rm_so));
            }
            lit = r + 1;            // next literal run starts after the reference
        }
        replacements++;

        int empty_match = (m[0].rm_eo == m[0].rm_so);
        cursor += m[0].rm_eo;
        if (empty_match) {
            // Without forced progress an empty match would repeat forever.
            if (*cursor == '\0') break;
            rr_append(&out, &out_len, &out_cap, cursor, 1);
            cursor++;
        }
    }

    if (replacements > 0) {
        // trailing text; must be copied before the old buffer is released
        rr_append(&out, &out_len, &out_cap, cursor, strlen(cursor));
        // adjust size; on failure the larger buffer is still valid
        char *fit = realloc(out, out_len + 1);
        if (fit) out = fit;
        free(*str);
        *str = out;
    }

    free(m);
    regfree(&reg);
    return replacements;
}

// Demo case 1: rewrites "[text->target]" spans as HTML anchors. The replacement
// uses the bytes \2 and \1 as back references to the second and first capture
// group of pattern1.
const char test1[] = "before [link->address] some text [link2->addr2] trail[a->[b->c]]";
const char *pattern1 = "\\[([^-]+)->([^]]+)\\]";
const char replace1[] = "<a href=\"\2\">\1</a>";

// Demo case 2: no capture groups; the replacement text itself contains the pattern.
const char test2[] = "abcabcdefghijklmnopqurstuvwxyzabc";
const char *pattern2 = "abc";
const char replace2[] = "!abc";

// Demo case 3: single-character pattern whose replacement is the pattern doubled.
const char test3[] = "a1a1a1a2ba1";
const char *pattern3 = "a";
const char replace3[] = "aa";
/* Runs one demo case: copies `input` into a heap buffer (regex_replace needs a
 * free-able string), prints it before and after the replacement, and returns
 * what regex_replace returned, or -1 if the copy could not be allocated. */
static int run_demo_case(int number, const char *input, const char *pattern, const char *replace)
{
    char *work = malloc(strlen(input) + 1);
    if (!work) {
        fprintf(stderr, "test %d: out of memory\n", number);
        return -1;
    }
    strcpy(work, input);
    puts(work);
    printf("test %d Before: [%s], ", number, work);
    // The result is signed: negative values are error codes, not counts.
    int replaced = regex_replace(&work, pattern, replace);
    printf("After replacing %d matches: [%s]\n", replaced, work);
    free(work);
    return replaced;
}

/* Runs the three demo cases; exits with a failure status if any of them
 * reported an error. */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    int failed = 0;
    if (run_demo_case(1, test1, pattern1, replace1) < 0) failed = 1;
    if (run_demo_case(2, test2, pattern2, replace2) < 0) failed = 1;
    if (run_demo_case(3, test3, pattern3, replace3) < 0) failed = 1;
    return failed ? EXIT_FAILURE : EXIT_SUCCESS;
}

Tags:

C

String

Regex