Files
bitcoin/lib/univalue_utffilter.h
fanquake 9b49ed656f Squashed 'src/univalue/' changes from 98fadc0909..a44caf65fe
a44caf65fe Merge bitcoin-core/univalue-subtree#28: Import fixes for sanitizer reported issues
135254331e Import fixes for sanitizer reported issues
d5fb86940e refactor: use c++11 range based for loop in checkObject
ff9c379304 refactor: Use nullptr (c++11) instead of NULL
08a99754d5 build: use ax_cxx_compile_stdcxx.m4 to check for C++11 support
66d3713ce7 Merge bitcoin-core/univalue#29: ci: travis -> cirrus
808d487292 ci: travis -> cirrus
c390ac375f Merge bitcoin-core/univalue#19: Split sources for easier buildsystem integration
4a5b0a1c65 build: Move source entries out to sources.mk
6c7d94b33c build: cleanup wonky gen usage
a222637c6d Merge #23: Merge changes from jgarzik/univalue@1ae6a23
f77d0f718d Merge commit '1ae6a231a0169938eb3972c1d48dd17cba5947e1' into HEAD
1ae6a231a0 Merge pull request #57 from MarcoFalke/test_fix
92bdd11f0b univalue_write: remove unneeded sstream.h include
ffb621c130 Merge pull request #56 from drodil/remove_sstream_header
f33acf9fe8 Merge commit '7890db9~' into HEAD
66e0adec4d Remove unnecessary sstream header from univalue.h
88967f6586 Version 1.0.4
1dc113dbef Merge pull request #50 from luke-jr/pushKV_bool
72392fb227 [tests] test pushKV for boolean values
c23132bcf4 Pushing boolean value to univalue correctly
81faab26a1 Merge pull request #48 from fwolfst/47-UPDATE_MIT_LINK_TO_HTTPS
b17634ef24 Update URLs to MIT license.
88ab64f6b5 Merge pull request #46 from jasonbcox/master
35ed96da31 Merge pull request #44 from MarcoFalke/Mf1709-univalue-cherrypick-explicit
420c226290 Merge pull request #45 from MarcoFalke/Mf1710-univalue-revert-test

git-subtree-dir: src/univalue
git-subtree-split: a44caf65fe55b9dd8ddb08f04c0f70409efd53b3
2021-10-11 20:45:56 +08:00

120 lines
4.6 KiB
C++

// Copyright 2016 Wladimir J. van der Laan
// Distributed under the MIT software license, see the accompanying
// file COPYING or https://opensource.org/licenses/mit-license.php.
#ifndef UNIVALUE_UTFFILTER_H
#define UNIVALUE_UTFFILTER_H
#include <string>
/**
* Filter that generates and validates UTF-8, as well as collates UTF-16
* surrogate pairs as specified in RFC4627.
*/
class JSONUTF8StringFilter
{
public:
explicit JSONUTF8StringFilter(std::string &s):
str(s), is_valid(true), codepoint(0), state(0), surpair(0)
{
}
// Write single 8-bit char (may be part of UTF-8 sequence)
void push_back(unsigned char ch)
{
if (state == 0) {
if (ch < 0x80) // 7-bit ASCII, fast direct pass-through
str.push_back(ch);
else if (ch < 0xc0) // Mid-sequence character, invalid in this state
is_valid = false;
else if (ch < 0xe0) { // Start of 2-byte sequence
codepoint = (ch & 0x1f) << 6;
state = 6;
} else if (ch < 0xf0) { // Start of 3-byte sequence
codepoint = (ch & 0x0f) << 12;
state = 12;
} else if (ch < 0xf8) { // Start of 4-byte sequence
codepoint = (ch & 0x07) << 18;
state = 18;
} else // Reserved, invalid
is_valid = false;
} else {
if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
is_valid = false;
state -= 6;
codepoint |= (ch & 0x3f) << state;
if (state == 0)
push_back_u(codepoint);
}
}
// Write codepoint directly, possibly collating surrogate pairs
void push_back_u(unsigned int codepoint_)
{
if (state) // Only accept full codepoints in open state
is_valid = false;
if (codepoint_ >= 0xD800 && codepoint_ < 0xDC00) { // First half of surrogate pair
if (surpair) // Two subsequent surrogate pair openers - fail
is_valid = false;
else
surpair = codepoint_;
} else if (codepoint_ >= 0xDC00 && codepoint_ < 0xE000) { // Second half of surrogate pair
if (surpair) { // Open surrogate pair, expect second half
// Compute code point from UTF-16 surrogate pair
append_codepoint(0x10000 | ((surpair - 0xD800)<<10) | (codepoint_ - 0xDC00));
surpair = 0;
} else // Second half doesn't follow a first half - fail
is_valid = false;
} else {
if (surpair) // First half of surrogate pair not followed by second - fail
is_valid = false;
else
append_codepoint(codepoint_);
}
}
// Check that we're in a state where the string can be ended
// No open sequences, no open surrogate pairs, etc
bool finalize()
{
if (state || surpair)
is_valid = false;
return is_valid;
}
private:
std::string &str;
bool is_valid;
// Current UTF-8 decoding state
unsigned int codepoint;
int state; // Top bit to be filled in for next UTF-8 byte, or 0
// Keep track of the following state to handle the following section of
// RFC4627:
//
// To escape an extended character that is not in the Basic Multilingual
// Plane, the character is represented as a twelve-character sequence,
// encoding the UTF-16 surrogate pair. So, for example, a string
// containing only the G clef character (U+1D11E) may be represented as
// "\uD834\uDD1E".
//
// Two subsequent \u.... may have to be replaced with one actual codepoint.
unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0
void append_codepoint(unsigned int codepoint_)
{
if (codepoint_ <= 0x7f)
str.push_back((char)codepoint_);
else if (codepoint_ <= 0x7FF) {
str.push_back((char)(0xC0 | (codepoint_ >> 6)));
str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
} else if (codepoint_ <= 0xFFFF) {
str.push_back((char)(0xE0 | (codepoint_ >> 12)));
str.push_back((char)(0x80 | ((codepoint_ >> 6) & 0x3F)));
str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
} else if (codepoint_ <= 0x1FFFFF) {
str.push_back((char)(0xF0 | (codepoint_ >> 18)));
str.push_back((char)(0x80 | ((codepoint_ >> 12) & 0x3F)));
str.push_back((char)(0x80 | ((codepoint_ >> 6) & 0x3F)));
str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
}
}
};
#endif