<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://linguifex.com/w/index.php?action=history&amp;feed=atom&amp;title=Module%3Astring%2Fisutf8</id>
	<title>Module:string/isutf8 - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://linguifex.com/w/index.php?action=history&amp;feed=atom&amp;title=Module%3Astring%2Fisutf8"/>
	<link rel="alternate" type="text/html" href="https://linguifex.com/w/index.php?title=Module:string/isutf8&amp;action=history"/>
	<updated>2026-04-09T04:20:56Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.43.6</generator>
	<entry>
		<id>https://linguifex.com/w/index.php?title=Module:string/isutf8&amp;diff=477618&amp;oldid=prev</id>
		<title>Sware: Created page with &quot;local byte = string.byte local match = string.match  --[==[Returns {true} if `str` is a valid UTF-8 string. This is true if, for each character, all of the following are true: * It has the expected number of bytes, which is determined by value of the leading byte: 1-byte characters are `0x00` to `0x7F`, 2-byte characters start with `0xC2` to `0xDF`, 3-byte characters start with `0xE0` to `0xEF`, and 4-byte characters start with `0xF0` to `0xF4`. * The leading byte must n...&quot;</title>
		<link rel="alternate" type="text/html" href="https://linguifex.com/w/index.php?title=Module:string/isutf8&amp;diff=477618&amp;oldid=prev"/>
		<updated>2025-11-15T15:20:24Z</updated>

		<summary type="html">&lt;p&gt;Created page with &amp;quot;local byte = string.byte local match = string.match  --[==[Returns {true} if `str` is a valid UTF-8 string. This is true if, for each character, all of the following are true: * It has the expected number of bytes, which is determined by value of the leading byte: 1-byte characters are `0x00` to `0x7F`, 2-byte characters start with `0xC2` to `0xDF`, 3-byte characters start with `0xE0` to `0xEF`, and 4-byte characters start with `0xF0` to `0xF4`. * The leading byte must n...&amp;quot;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;local byte = string.byte&lt;br /&gt;
local match = string.match&lt;br /&gt;
&lt;br /&gt;
--[==[Returns {true} if `str` is a valid UTF-8 string. This is true if, for each character, all of the following are true:&lt;br /&gt;
* It has the expected number of bytes, which is determined by the value of the leading byte: 1-byte characters are `0x00` to `0x7F`, 2-byte characters start with `0xC2` to `0xDF`, 3-byte characters start with `0xE0` to `0xEF`, and 4-byte characters start with `0xF0` to `0xF4`.&lt;br /&gt;
* The leading byte must not fall outside of the above ranges.&lt;br /&gt;
* The trailing byte(s) (if any) must be between `0x80` and `0xBF`.&lt;br /&gt;
* The character&amp;#039;s codepoint must be between U+0000 (`0x00`) and U+10FFFF (`0xF4 0x8F 0xBF 0xBF`).&lt;br /&gt;
* The character cannot have an overlong encoding: for each byte length, the lowest theoretical encoding is equivalent to U+0000 (e.g. `0xE0 0x80 0x80`, the lowest theoretical 3-byte encoding, is exactly equivalent to U+0000). Encodings that use more than the minimum number of bytes are not considered valid, meaning that the first valid 3-byte character is `0xE0 0xA0 0x80` (U+0800), and the first valid 4-byte character is `0xF0 0x90 0x80 0x80` (U+10000). Formally, 2-byte characters have leading bytes ranging from `0xC0` to `0xDF` (rather than `0xC2` to `0xDF`), but `0xC0 0x80` to `0xC1 0xBF` are overlong encodings, so it is simpler to say that the 2-byte range begins at `0xC2`.&lt;br /&gt;
&lt;br /&gt;
If `allow_surrogates` is set, surrogates (U+D800 to U+DFFF) will be treated as valid UTF-8. Surrogates are used in UTF-16, which encodes codepoints U+0000 to U+FFFF with 2 bytes, and codepoints from U+10000 upwards using a pair of surrogates, which are taken together as a 4-byte unit. Since surrogates have no use in UTF-8, as it encodes higher codepoints in a different way, they are not considered valid in UTF-8 text. However, there are limited circumstances where they may be necessary: for instance, JSON escapes characters using the format `\u0000`, which must contain exactly 4 hexadecimal digits; under the scheme, codepoints above U+FFFF must be escaped as the equivalent pair of surrogates, even though the text itself must be encoded in UTF-8 (e.g. U+10000 becomes `\uD800\uDC00`).]==]&lt;br /&gt;
return function(str, allow_surrogates)&lt;br /&gt;
	-- Returns true if every byte sequence in `str` is well-formed UTF-8 and&lt;br /&gt;
	-- false otherwise. An empty string is valid. If `allow_surrogates` is&lt;br /&gt;
	-- truthy, the surrogate encodings \xED[\xA0-\xBF]... are accepted too.&lt;br /&gt;
	local loc, str_len = 1, #str&lt;br /&gt;
	while true do&lt;br /&gt;
		-- Skipping ASCII bytes with [^\128-\255]* is much faster than searching&lt;br /&gt;
		-- for [\128-\255]. The empty capture () yields the position of the&lt;br /&gt;
		-- first non-ASCII byte, or str_len + 1 if there is none.&lt;br /&gt;
		loc = match(str, &amp;quot;^[^\128-\255]*()&amp;quot;, loc)&lt;br /&gt;
		if loc &amp;gt; str_len then&lt;br /&gt;
			return true&lt;br /&gt;
		end&lt;br /&gt;
		-- Grab 5 bytes (i.e. at least one extra), to determine when the loop&lt;br /&gt;
		-- should break. Bytes past the end of the string come back as nil.&lt;br /&gt;
		local b1, b2, b3, b4, b5 = byte(str, loc, loc + 4)&lt;br /&gt;
		while true do&lt;br /&gt;
			-- b1 is never ASCII here, so it has to be a leading byte followed&lt;br /&gt;
			-- by at least one trailing byte [\x80-\xBF]. The leading bytes for&lt;br /&gt;
			-- 2-byte encodings are formally [\xC0-\xDF], but [\xC0\xC1] always&lt;br /&gt;
			-- form overlong encodings. Both bounds of b2 are checked here, for&lt;br /&gt;
			-- every sequence length: the 3- and 4-byte branches below only&lt;br /&gt;
			-- narrow the range of b2 further, so without the lower bound an&lt;br /&gt;
			-- ASCII second byte (e.g. \xE1 \x41 \x80) would be accepted.&lt;br /&gt;
			if b1 &amp;lt; 0xC2 or not b2 or b2 &amp;lt; 0x80 or b2 &amp;gt; 0xBF then&lt;br /&gt;
				return false&lt;br /&gt;
			-- 2 bytes: [\xC2-\xDF][\x80-\xBF].&lt;br /&gt;
			elseif b1 &amp;lt; 0xE0 then&lt;br /&gt;
				loc = loc + 2&lt;br /&gt;
				if not b3 then&lt;br /&gt;
					return true&lt;br /&gt;
				elseif b3 &amp;lt; 0x80 then&lt;br /&gt;
					-- Next byte is ASCII: go back to the fast ASCII skip.&lt;br /&gt;
					break&lt;br /&gt;
				end&lt;br /&gt;
				-- Shift the look-ahead window and top it up with 2 new bytes.&lt;br /&gt;
				b1, b2, b3, b4, b5 = b3, b4, b5, byte(str, loc + 3, loc + 4)&lt;br /&gt;
			-- Trailing byte: [\x80-\xBF].&lt;br /&gt;
			elseif not b3 or b3 &amp;lt; 0x80 or b3 &amp;gt; 0xBF then&lt;br /&gt;
				return false&lt;br /&gt;
			-- 3 bytes: [\xE0-\xEF]...&lt;br /&gt;
			elseif b1 &amp;lt; 0xF0 then&lt;br /&gt;
				-- If b2 is [\x80-\x9F], exclude \xE0[\x80-\x9F]..., which are&lt;br /&gt;
				-- overlong encodings.&lt;br /&gt;
				if b2 &amp;lt; 0xA0 then&lt;br /&gt;
					if b1 &amp;lt; 0xE1 then&lt;br /&gt;
						return false&lt;br /&gt;
					end&lt;br /&gt;
				-- The remaining b2 values [\xA0-\xBF] can form the surrogates&lt;br /&gt;
				-- \xED[\xA0-\xBF]...&lt;br /&gt;
				elseif b1 == 0xED and not allow_surrogates then&lt;br /&gt;
					return false&lt;br /&gt;
				end&lt;br /&gt;
				loc = loc + 3&lt;br /&gt;
				if not b4 then&lt;br /&gt;
					return true&lt;br /&gt;
				elseif b4 &amp;lt; 0x80 then&lt;br /&gt;
					break&lt;br /&gt;
				end&lt;br /&gt;
				-- Shift the look-ahead window and top it up with 3 new bytes.&lt;br /&gt;
				b1, b2, b3, b4, b5 = b4, b5, byte(str, loc + 2, loc + 4)&lt;br /&gt;
			-- Trailing byte: [\x80-\xBF].&lt;br /&gt;
			elseif not b4 or b4 &amp;lt; 0x80 or b4 &amp;gt; 0xBF then&lt;br /&gt;
				return false&lt;br /&gt;
			-- 4-bytes: [\xF0-\xF4]...&lt;br /&gt;
			-- If b2 is [\x80-\x8F], exclude \xF0[\x80-\x8F]..., which are&lt;br /&gt;
			-- overlong encodings, but allow \xF4 as a leading byte, since&lt;br /&gt;
			-- \xF4\x8F\xBF\xBF is the highest valid codepoint (U+10FFFF).&lt;br /&gt;
			else&lt;br /&gt;
				if b2 &amp;lt; 0x90 then&lt;br /&gt;
					if b1 &amp;lt; 0xF1 or b1 &amp;gt; 0xF4 then&lt;br /&gt;
						return false&lt;br /&gt;
					end&lt;br /&gt;
				-- If b2 is [\x90-\xBF], \xF4 would exceed U+10FFFF, so the&lt;br /&gt;
				-- leading byte can be at most \xF3.&lt;br /&gt;
				elseif b1 &amp;gt; 0xF3 then&lt;br /&gt;
					return false&lt;br /&gt;
				end&lt;br /&gt;
				loc = loc + 4&lt;br /&gt;
				if not b5 then&lt;br /&gt;
					return true&lt;br /&gt;
				elseif b5 &amp;lt; 0x80 then&lt;br /&gt;
					break&lt;br /&gt;
				end&lt;br /&gt;
				-- Shift the look-ahead window and top it up with 4 new bytes.&lt;br /&gt;
				b1, b2, b3, b4, b5 = b5, byte(str, loc + 1, loc + 4)&lt;br /&gt;
			end&lt;br /&gt;
		end&lt;br /&gt;
	end&lt;br /&gt;
end&lt;/div&gt;</summary>
		<author><name>Sware</name></author>
	</entry>
</feed>