From cb2d348d48cfd66fa34edd5df8c255cf74170a84 Mon Sep 17 00:00:00 2001
From: Nikola Knezevic <nikola@dfinity.org>
Date: Mon, 6 Jan 2020 16:30:56 +0100
Subject: [PATCH 1/2] Remove redundant check in parseJSONString

---
 src/libexpr/json-to-value.cc | 1 -
 1 file changed, 1 deletion(-)
diff --git a/src/libexpr/json-to-value.cc b/src/libexpr/json-to-value.cc
index 8bae986f9..96cd0fc72 100644
--- a/src/libexpr/json-to-value.cc
+++ b/src/libexpr/json-to-value.cc
@@ -22,7 +22,6 @@ static string parseJSONString(const char * & s)
             if (*s == '"') res += '"';
             else if (*s == '\\') res += '\\';
             else if (*s == '/') res += '/';
-            else if (*s == '/') res += '/';
             else if (*s == 'b') res += '\b';
             else if (*s == 'f') res += '\f';
             else if (*s == 'n') res += '\n';

From 52a8f9295b828872586c5b9e5587064a25dae9b2 Mon Sep 17 00:00:00 2001
From: Nikola Knezevic <nikola@dfinity.org>
Date: Tue, 7 Jan 2020 00:06:49 +0100
Subject: [PATCH 2/2] Add support for \u escape in fromJSON

As fromTOML supports \u and \U escapes, bring fromJSON on par. As JSON defaults
to UTF-8 encoding (every JSON parser must support UTF-8), this change parses the
`\u hex hex hex hex` sequence (\u followed by 4 hexadecimal digits) into its
UTF-8 representation.

Add a test to verify correct parsing, using all escape sequences from json.org.
---
 src/libexpr/json-to-value.cc              | 88 ++++++++++++++++++++++-
 tests/lang/eval-okay-fromjson-escapes.exp |  1 +
 tests/lang/eval-okay-fromjson-escapes.nix |  3 +
 3 files changed, 90 insertions(+), 2 deletions(-)
 create mode 100644 tests/lang/eval-okay-fromjson-escapes.exp
 create mode 100644 tests/lang/eval-okay-fromjson-escapes.nix

diff --git a/src/libexpr/json-to-value.cc b/src/libexpr/json-to-value.cc
index 96cd0fc72..47cab2bb5 100644
--- a/src/libexpr/json-to-value.cc
+++ b/src/libexpr/json-to-value.cc
@@ -11,6 +11,87 @@ static void skipWhitespace(const char * & s)
 }
 
 
+/*
+  Parse the four hex digits that follow `\u` in a JSON string and return the
+  code point encoded as UTF-8. `s` must point at the first digit; on return it
+  points just past the fourth. Throws JSONParseError on end-of-string, on a
+  non-hex character, or on a surrogate (surrogate pairs are not combined).
+*/
+static string parseUnicodeEscapeSequence(const char * & s)
+{
+    int codepoint = 0;
+
+    const auto factors = { 12u, 8u, 4u, 0u };
+    for (const auto factor : factors)
+    {
+        if (!*s) throw JSONParseError("got end-of-string in JSON string while parsing \\u sequence");
+
+        if (*s >= '0' and *s <= '9') {
+            codepoint += static_cast<int>((static_cast<unsigned int>(*s) - 0x30u) << factor);
+        } else if (*s >= 'A' and *s <= 'F') {
+            codepoint += static_cast<int>((static_cast<unsigned int>(*s) - 0x37u) << factor);
+        } else if (*s >= 'a' and *s <= 'f') {
+            codepoint += static_cast<int>((static_cast<unsigned int>(*s) - 0x57u) << factor);
+        } else {
+            throw JSONParseError(format("illegal character '%1%' in \\u escape sequence.") % *s);
+        }
+        s++;
+    }
+
+    if ((codepoint > 0xd7ff && codepoint < 0xe000) || codepoint > 0x10ffff) {
+        throw JSONParseError("Unicode escape sequence is not a Unicode scalar value");
+    }
+
+    // taken from cpptoml.h
+    std::string result;
+    // See Table 3-6 of the Unicode standard
+    if (codepoint <= 0x7f)
+    {
+        // 1-byte codepoints: 00000000 0xxxxxxx
+        // repr: 0xxxxxxx
+        result += static_cast<char>(codepoint & 0x7f);
+    }
+    else if (codepoint <= 0x7ff)
+    {
+        // 2-byte codepoints: 00000yyy yyxxxxxx
+        // repr: 110yyyyy 10xxxxxx
+        //
+        // 0x1f = 00011111
+        // 0xc0 = 11000000
+        result += static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f));
+        //
+        // 0x80 = 10000000
+        // 0x3f = 00111111
+        result += static_cast<char>(0x80 | (codepoint & 0x3f));
+    }
+    else if (codepoint <= 0xffff)
+    {
+        // 3-byte codepoints: zzzzyyyy yyxxxxxx
+        // repr: 1110zzzz 10yyyyyy 10xxxxxx
+        //
+        // 0xe0 = 11100000
+        // 0x0f = 00001111
+        // both continuation bytes carry 6 payload bits, hence the 0x3f mask
+        result += static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f));
+        result += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
+        result += static_cast<char>(0x80 | (codepoint & 0x3f));
+    }
+    else
+    {
+        // 4-byte codepoints: 000uuuuu zzzzyyyy yyxxxxxx
+        // repr: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
+        // (not reachable from one 4-digit escape, which tops out at 0xffff)
+        // 0xf0 = 11110000
+        // 0x07 = 00000111
+        result += static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07));
+        result += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f));
+        result += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
+        result += static_cast<char>(0x80 | (codepoint & 0x3f));
+    }
+    return result;
+}
+
+
 static string parseJSONString(const char * & s)
 {
     string res;
@@ -27,8 +108,11 @@ static string parseJSONString(const char * & s)
             else if (*s == 'n') res += '\n';
             else if (*s == 'r') res += '\r';
             else if (*s == 't') res += '\t';
-            else if (*s == 'u') throw JSONParseError("\\u characters in JSON strings are currently not supported");
-            else throw JSONParseError("invalid escaped character in JSON string");
+            else if (*s == 'u') {
+                res += parseUnicodeEscapeSequence(++s);
+                // to neuter the outside s++
+                s--;
+            } else throw JSONParseError("invalid escaped character in JSON string");
             s++;
         } else
             res += *s++;
diff --git a/tests/lang/eval-okay-fromjson-escapes.exp b/tests/lang/eval-okay-fromjson-escapes.exp
new file mode 100644
index 000000000..add5505a8
--- /dev/null
+++ b/tests/lang/eval-okay-fromjson-escapes.exp
@@ -0,0 +1 @@
+"quote \" reverse solidus \\ solidus / backspace  formfeed  newline \n carriage return \r horizontal tab \t 1 char unicode encoded backspace  1 char unicode encoded e with accent é 2 char unicode encoded s with caron š 3 char unicode encoded rightwards arrow →"
diff --git a/tests/lang/eval-okay-fromjson-escapes.nix b/tests/lang/eval-okay-fromjson-escapes.nix
new file mode 100644
index 000000000..f00713507
--- /dev/null
+++ b/tests/lang/eval-okay-fromjson-escapes.nix
@@ -0,0 +1,3 @@
+# This string contains all supported escapes in a JSON string, per json.org
+# \b and \f are not supported by Nix
+builtins.fromJSON ''"quote \" reverse solidus \\ solidus \/ backspace \b formfeed \f newline \n carriage return \r horizontal tab \t 1 char unicode encoded backspace \u0008 1 char unicode encoded e with accent \u00e9 2 char unicode encoded s with caron \u0161 3 char unicode encoded rightwards arrow \u2192"''