blob: d1511de08ede19b9ac7fbdc9c71bf755d6e940f3 [file] [log] [blame]
/*
*
* Copyright (c) 2023 Project CHIP Authors
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "utf8.h"
namespace chip {
namespace Utf8 {
namespace {
/**
State machine for UTF8 valid bytes
Table 3-7. Well-Formed UTF-8 Byte Sequences
Code Points | First B | Second B | Third B | Fourth B
------------------+----------+------------+---------+---------
U+0000..U+007F | 00..7F | | |
U+0080..U+07FF | C2..DF | 80..BF | |
U+0800..U+0FFF | E0 | A0..BF (A) | 80..BF |
U+1000..U+CFFF | E1..EC | 80..BF | 80..BF |
U+D000..U+D7FF | ED | 80..9F (B) | 80..BF |
U+E000..U+FFFF | EE..EF | 80..BF | 80..BF |
U+10000..U+3FFFF | F0 | 90..BF (C) | 80..BF | 80..BF
U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF
U+100000..U+10FFFF| F4 | 80..8F (D) | 80..BF | 80..BF
*/
enum class ParserState
{
kFirstByte,
kSecondByte_A,
kSecondByte_B,
kSecondByte_C,
kSecondByte_D,
kExtraOneByte, // 0x80 .. 0xBF once
kExtraTwoBytes, // 0x80 .. 0xBF twice
kExtraThreeBytes, // 0x80 .. 0xBF three times
//
kInvalid, // some error
};
ParserState NextState(ParserState state, uint8_t value)
{
switch (state)
{
case ParserState::kFirstByte:
if (value <= 0x7F)
{
return ParserState::kFirstByte;
}
else if ((value >= 0xC2) && (value <= 0xDF))
{
return ParserState::kExtraOneByte;
}
else if (value == 0xE0)
{
return ParserState::kSecondByte_A;
}
else if ((value >= 0xE1) && (value <= 0xEC))
{
return ParserState::kExtraTwoBytes;
}
else if (value == 0xED)
{
return ParserState::kSecondByte_B;
}
else if ((value >= 0xEE) && (value <= 0xEF))
{
return ParserState::kExtraTwoBytes;
}
else if (value == 0xF0)
{
return ParserState::kSecondByte_C;
}
else if ((value >= 0xF1) && (value <= 0xF3))
{
return ParserState::kExtraThreeBytes;
}
else if (value == 0xF4)
{
return ParserState::kSecondByte_D;
}
else
{
return ParserState::kInvalid;
}
case ParserState::kSecondByte_A:
if (value >= 0xA0 && value <= 0xBF)
{
return ParserState::kExtraOneByte;
}
else
{
return ParserState::kInvalid;
}
case ParserState::kSecondByte_B:
if (value >= 0x80 && value <= 0x9F)
{
return ParserState::kExtraOneByte;
}
else
{
return ParserState::kInvalid;
}
case ParserState::kSecondByte_C:
if (value >= 0x90 && value <= 0xBF)
{
return ParserState::kExtraTwoBytes;
}
else
{
return ParserState::kInvalid;
}
case ParserState::kSecondByte_D:
if (value >= 0x80 && value <= 0x8F)
{
return ParserState::kExtraTwoBytes;
}
else
{
return ParserState::kInvalid;
}
case ParserState::kExtraOneByte:
if (value >= 0x80 && value <= 0xBF)
{
return ParserState::kFirstByte;
}
else
{
return ParserState::kInvalid;
}
case ParserState::kExtraTwoBytes:
if (value >= 0x80 && value <= 0xBF)
{
return ParserState::kExtraOneByte;
}
else
{
return ParserState::kInvalid;
}
case ParserState::kExtraThreeBytes:
if (value >= 0x80 && value <= 0xBF)
{
return ParserState::kExtraTwoBytes;
}
else
{
return ParserState::kInvalid;
}
default:
return ParserState::kInvalid;
}
}
} // namespace
bool IsValid(CharSpan span)
{
ParserState state = ParserState::kFirstByte;
const char * data = span.data();
const size_t kLength = span.size();
// Every byte should be valid
for (size_t i = 0; i < kLength; i++)
{
state = NextState(state, static_cast<uint8_t>(data[i]));
if (state == ParserState::kInvalid)
{
return false;
}
}
// finally no continuation should be expected
return state == ParserState::kFirstByte;
}
} // namespace Utf8
} // namespace chip