Regular expression for a language tag (as defined by BCP47)

An optimized version that works in PHP.

/^(?<grandfathered>(?:en-GB-oed|i-(?:ami|bnn|default|enochian|hak|klingon|lux|mingo|navajo|pwn|t(?:a[oy]|su))|sgn-(?:BE-(?:FR|NL)|CH-DE))|(?:art-lojban|cel-gaulish|no-(?:bok|nyn)|zh-(?:guoyu|hakka|min(?:-nan)?|xiang)))|(?:(?<language>(?:[A-Za-z]{2,3}(?:-(?<extlang>[A-Za-z]{3}(?:-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(?:-(?<script>[A-Za-z]{4}))?(?:-(?<region>[A-Za-z]{2}|[0-9]{3}))?(?:-(?<variant>[A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(?:-(?<extension>[0-9A-WY-Za-wy-z](?:-[A-Za-z0-9]{2,8})+))*)(?:-(?<privateUse>x(?:-[A-Za-z0-9]{1,8})+))?$/Di

Javascript polices duplicate named capture groups so you have to change the 2nd use of ?<privateUse> to e.g. ?<privateUse1>. Compiles to:

/^((?<grandfathered>(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang))|((?<language>([A-Za-z]{2,3}(-(?<extlang>[A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-(?<script>[A-Za-z]{4}))?(-(?<region>[A-Za-z]{2}|[0-9]{3}))?(-(?<variant>[A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-(?<extension>[0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(?<privateUse>x(-[A-Za-z0-9]{1,8})+))?)|(?<privateUse1>x(-[A-Za-z0-9]{1,8})+))$/

Here's a way to construct it:

let privateUseUsed = 0
const privateUse = () => "(?<privateUse" + (privateUseUsed++) + ">x(-[A-Za-z0-9]{1,8})+)"
const grandfathered = "(?<grandfathered>" +
      /* irregular */ (
        "en-GB-oed" +
          "|" + "i-(?:ami|bnn|default|enochian|hak|klingon|lux|mingo|navajo|pwn|tao|tay|tsu)" +
          "|" + "sgn-(?:BE-FR|BE-NL|CH-DE)"
      ) +
      "|" + /* regular */ (
        "art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang"
      ) +
      ")"
const langtag = "(" +
      "(?<language>" + (
        "([A-Za-z]{2,3}(-" +
          "(?<extlang>[A-Za-z]{3}(-[A-Za-z]{3}){0,2})" +
          ")?)|[A-Za-z]{4,8})"
      ) +
      "(-" + "(?<script>[A-Za-z]{4})" + ")?" +
      "(-" + "(?<region>[A-Za-z]{2}|[0-9]{3})" + ")?" +
      "(-" + "(?<variant>[A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3})" + ")*" +
      "(-" + "(?<extension>" + (
        /* singleton */ "[0-9A-WY-Za-wy-z]" +
          "(-[A-Za-z0-9]{2,8})+)"
      ) +
      ")*" +
      "(-" + privateUse() + ")?" +
      ")"
const languageTagReStr = "^(" + grandfathered + "|" + langtag + "|" + privateUse() + ")$";

Edit: turns out ff doens't support named capture groups so you have to strip them out with .replace(/\?<a-zA-Z>/g, '') or jest leave them out in the first place:

const grandfathered = "(" +
      /* irregular */ "(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)" +
      "|" +
      /* regular */ "(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)" +
      ")";
const langtag = "(" +
      "(" + (
        "([A-Za-z]{2,3}(-" +
          "([A-Za-z]{3}(-[A-Za-z]{3}){0,2})" +
          ")?)|[A-Za-z]{4}|[A-Za-z]{5,8})"
      ) +
      "(-" + "([A-Za-z]{4})" + ")?" +
      "(-" + "([A-Za-z]{2}|[0-9]{3})" + ")?" +
      "(-" + "([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3})" + ")*" +
      "(-" + "(" + (
        /* singleton */ "[0-9A-WY-Za-wy-z]" +
          "(-[A-Za-z0-9]{2,8})+)"
      ) +
      ")*" +
      "(-" + "(x(-[A-Za-z0-9]{1,8})+)" + ")?" +
      ")";
const languageTag = RegExp("^(" + grandfathered + "|" + langtag + "|" + "(x(-[A-Za-z0-9]{1,8})+)" + ")$");

Test with languageTag.test('en-us')


Looks like this:

^((?<grandfathered>(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|
i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|
cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang))|((?<language>
([A-Za-z]{2,3}(-(?<extlang>[A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})
(-(?<script>[A-Za-z]{4}))?(-(?<region>[A-Za-z]{2}|[0-9]{3}))?(-(?<variant>[A-Za-z0-9]{5,8}
|[0-9][A-Za-z0-9]{3}))*(-(?<extension>[0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*
(-(?<privateUse>x(-[A-Za-z0-9]{1,8})+))?)|(?<privateUse>x(-[A-Za-z0-9]{1,8})+))$

Here is the code to generate it (in C#):

var regular = "(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)";
var irregular = "(en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)";
var grandfathered = "(?<grandfathered>" + irregular + "|" + regular + ")";
var privateUse = "(?<privateUse>x(-[A-Za-z0-9]{1,8})+)";
var singleton = "[0-9A-WY-Za-wy-z]";
var extension = "(?<extension>" + singleton + "(-[A-Za-z0-9]{2,8})+)";
var variant = "(?<variant>[A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3})";
var region = "(?<region>[A-Za-z]{2}|[0-9]{3})";
var script = "(?<script>[A-Za-z]{4})";
var extlang = "(?<extlang>[A-Za-z]{3}(-[A-Za-z]{3}){0,2})";
var language = "(?<language>([A-Za-z]{2,3}(-" + extlang + ")?)|[A-Za-z]{4}|[A-Za-z]{5,8})";
var langtag = "(" + language + "(-" + script + ")?" + "(-" + region + ")?" + "(-" + variant + ")*" + "(-" + extension + ")*" + "(-" + privateUse + ")?" + ")";
var languageTag = @"^(" + grandfathered + "|" + langtag + "|" + privateUse + ")$";

Console.WriteLine(languageTag);

I cannot guarantee its correctness (I may have made typos), but it works fine on the examples in Appendix A.

Depending on your environment, you might need to remove the named capturing groups "?<...>".

Tags:

Regex

Bnf