diff --git a/libcockatrice_utility/libcockatrice/utility/peglib.h b/libcockatrice_utility/libcockatrice/utility/peglib.h index 6a5b87b2d..3ae6040c4 100644 --- a/libcockatrice_utility/libcockatrice/utility/peglib.h +++ b/libcockatrice_utility/libcockatrice/utility/peglib.h @@ -1,4 +1,4 @@ -// +// // peglib.h // // Copyright (c) 2022 Yuji Hirose. All rights reserved. @@ -17,6 +17,7 @@ #include #include +#include #include #include #if __has_include() @@ -30,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -51,28 +53,28 @@ namespace peg { // "http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2014/n4189". template struct scope_exit { - explicit scope_exit(EF &&f) - : exit_function(std::move(f)), execute_on_destruction{true} {} + explicit scope_exit(EF &&f) + : exit_function(std::move(f)), execute_on_destruction{true} {} - scope_exit(scope_exit &&rhs) - : exit_function(std::move(rhs.exit_function)), - execute_on_destruction{rhs.execute_on_destruction} { - rhs.release(); - } + scope_exit(scope_exit &&rhs) + : exit_function(std::move(rhs.exit_function)), + execute_on_destruction{rhs.execute_on_destruction} { + rhs.release(); + } - ~scope_exit() { - if (execute_on_destruction) { this->exit_function(); } - } + ~scope_exit() { + if (execute_on_destruction) { this->exit_function(); } + } - void release() { this->execute_on_destruction = false; } + void release() { this->execute_on_destruction = false; } private: - scope_exit(const scope_exit &) = delete; - void operator=(const scope_exit &) = delete; - scope_exit &operator=(scope_exit &&) = delete; + scope_exit(const scope_exit &) = delete; + void operator=(const scope_exit &) = delete; + scope_exit &operator=(scope_exit &&) = delete; - EF exit_function; - bool execute_on_destruction; + EF exit_function; + bool execute_on_destruction; }; /*----------------------------------------------------------------------------- @@ -80,130 +82,136 @@ private: *---------------------------------------------------------------------------*/ inline size_t codepoint_length(const char *s8, size_t l) { - if (l) { - auto b = static_cast(s8[0]); - if ((b & 0x80) == 0) { - return 1; - } else if ((b & 0xE0) == 0xC0 && l >= 2) { - return 2; - } else if ((b & 0xF0) == 0xE0 && l >= 3) { - return 3; - } else if ((b & 0xF8) == 0xF0 && l >= 4) { - return 4; - } + if (l) { + auto b = static_cast(s8[0]); + if ((b & 0x80) == 0) { + return 1; + } else if ((b & 0xE0) == 0xC0 && l >= 2) { + return 2; + } else if ((b & 0xF0) == 0xE0 && l >= 3) { + return 3; + } else if ((b & 0xF8) == 0xF0 && l >= 4) { + return 4; } - return 0; + } + return 0; } inline size_t codepoint_count(const char *s8, size_t l) { - size_t count = 0; - for (size_t i = 0; i < l; i += codepoint_length(s8 + i, l - i)) { - count++; + size_t count = 0; + for (size_t i = 0; i < l;) { + auto len = codepoint_length(s8 + i, l - i); + if (len == 0) { + // Invalid UTF-8 byte, treat as single byte to avoid infinite loop + len = 1; } - return count; + i += len; + count++; + } + return count; } inline size_t encode_codepoint(char32_t cp, char *buff) { - if (cp < 0x0080) { - buff[0] = static_cast(cp & 0x7F); - return 1; - } else if (cp < 0x0800) { - buff[0] = static_cast(0xC0 | ((cp >> 6) & 0x1F)); - buff[1] = static_cast(0x80 | (cp & 0x3F)); - return 2; - } else if (cp < 0xD800) { - buff[0] = static_cast(0xE0 | ((cp >> 12) & 0xF)); - buff[1] = static_cast(0x80 | ((cp >> 6) & 0x3F)); - buff[2] = static_cast(0x80 | (cp & 0x3F)); - return 3; - } else if (cp < 0xE000) { - // D800 - DFFF is invalid... - return 0; - } else if (cp < 0x10000) { - buff[0] = static_cast(0xE0 | ((cp >> 12) & 0xF)); - buff[1] = static_cast(0x80 | ((cp >> 6) & 0x3F)); - buff[2] = static_cast(0x80 | (cp & 0x3F)); - return 3; - } else if (cp < 0x110000) { - buff[0] = static_cast(0xF0 | ((cp >> 18) & 0x7)); - buff[1] = static_cast(0x80 | ((cp >> 12) & 0x3F)); - buff[2] = static_cast(0x80 | ((cp >> 6) & 0x3F)); - buff[3] = static_cast(0x80 | (cp & 0x3F)); - return 4; - } + if (cp < 0x0080) { + buff[0] = static_cast(cp & 0x7F); + return 1; + } else if (cp < 0x0800) { + buff[0] = static_cast(0xC0 | ((cp >> 6) & 0x1F)); + buff[1] = static_cast(0x80 | (cp & 0x3F)); + return 2; + } else if (cp < 0xD800) { + buff[0] = static_cast(0xE0 | ((cp >> 12) & 0xF)); + buff[1] = static_cast(0x80 | ((cp >> 6) & 0x3F)); + buff[2] = static_cast(0x80 | (cp & 0x3F)); + return 3; + } else if (cp < 0xE000) { + // D800 - DFFF is invalid... return 0; + } else if (cp < 0x10000) { + buff[0] = static_cast(0xE0 | ((cp >> 12) & 0xF)); + buff[1] = static_cast(0x80 | ((cp >> 6) & 0x3F)); + buff[2] = static_cast(0x80 | (cp & 0x3F)); + return 3; + } else if (cp < 0x110000) { + buff[0] = static_cast(0xF0 | ((cp >> 18) & 0x7)); + buff[1] = static_cast(0x80 | ((cp >> 12) & 0x3F)); + buff[2] = static_cast(0x80 | ((cp >> 6) & 0x3F)); + buff[3] = static_cast(0x80 | (cp & 0x3F)); + return 4; + } + return 0; } inline std::string encode_codepoint(char32_t cp) { - char buff[4]; - auto l = encode_codepoint(cp, buff); - return std::string(buff, l); + char buff[4]; + auto l = encode_codepoint(cp, buff); + return std::string(buff, l); } inline bool decode_codepoint(const char *s8, size_t l, size_t &bytes, char32_t &cp) { - if (l) { - auto b = static_cast(s8[0]); - if ((b & 0x80) == 0) { - bytes = 1; - cp = b; - return true; - } else if ((b & 0xE0) == 0xC0) { - if (l >= 2) { - bytes = 2; - cp = ((static_cast(s8[0] & 0x1F)) << 6) | - (static_cast(s8[1] & 0x3F)); - return true; - } - } else if ((b & 0xF0) == 0xE0) { - if (l >= 3) { - bytes = 3; - cp = ((static_cast(s8[0] & 0x0F)) << 12) | - ((static_cast(s8[1] & 0x3F)) << 6) | - (static_cast(s8[2] & 0x3F)); - return true; - } - } else if ((b & 0xF8) == 0xF0) { - if (l >= 4) { - bytes = 4; - cp = ((static_cast(s8[0] & 0x07)) << 18) | - ((static_cast(s8[1] & 0x3F)) << 12) | - ((static_cast(s8[2] & 0x3F)) << 6) | - (static_cast(s8[3] & 0x3F)); - return true; - } - } + if (l) { + auto b = static_cast(s8[0]); + if ((b & 0x80) == 0) { + bytes = 1; + cp = b; + return true; + } else if ((b & 0xE0) == 0xC0) { + if (l >= 2) { + bytes = 2; + cp = ((static_cast(s8[0] & 0x1F)) << 6) | + (static_cast(s8[1] & 0x3F)); + return true; + } + } else if ((b & 0xF0) == 0xE0) { + if (l >= 3) { + bytes = 3; + cp = ((static_cast(s8[0] & 0x0F)) << 12) | + ((static_cast(s8[1] & 0x3F)) << 6) | + (static_cast(s8[2] & 0x3F)); + return true; + } + } else if ((b & 0xF8) == 0xF0) { + if (l >= 4) { + bytes = 4; + cp = ((static_cast(s8[0] & 0x07)) << 18) | + ((static_cast(s8[1] & 0x3F)) << 12) | + ((static_cast(s8[2] & 0x3F)) << 6) | + (static_cast(s8[3] & 0x3F)); + return true; + } } - return false; + } + return false; } inline size_t decode_codepoint(const char *s8, size_t l, char32_t &cp) { - size_t bytes; - if (decode_codepoint(s8, l, bytes, cp)) { return bytes; } - return 0; + size_t bytes; + if (decode_codepoint(s8, l, bytes, cp)) { return bytes; } + return 0; } inline char32_t decode_codepoint(const char *s8, size_t l) { - char32_t cp = 0; - decode_codepoint(s8, l, cp); - return cp; + char32_t cp = 0; + decode_codepoint(s8, l, cp); + return cp; } inline std::u32string decode(const char *s8, size_t l) { - std::u32string out; - size_t i = 0; - while (i < l) { - auto beg = i++; - while (i < l && (s8[i] & 0xc0) == 0x80) { - i++; - } - out += decode_codepoint(&s8[beg], (i - beg)); + std::u32string out; + size_t i = 0; + while (i < l) { + auto beg = i++; + while (i < l && (s8[i] & 0xc0) == 0x80) { + i++; } - return out; + out += decode_codepoint(&s8[beg], (i - beg)); + } + return out; } template const char *u8(const T *s) { - return reinterpret_cast(s); + return reinterpret_cast(s); } /*----------------------------------------------------------------------------- @@ -211,23 +219,23 @@ template const char *u8(const T *s) { *---------------------------------------------------------------------------*/ inline std::string escape_characters(const char *s, size_t n) { - std::string str; - for (size_t i = 0; i < n; i++) { - auto c = s[i]; - switch (c) { - case '\f': str += "\\f"; break; - case '\n': str += "\\n"; break; - case '\r': str += "\\r"; break; - case '\t': str += "\\t"; break; - case '\v': str += "\\v"; break; - default: str += c; break; - } + std::string str; + for (size_t i = 0; i < n; i++) { + auto c = s[i]; + switch (c) { + case '\f': str += "\\f"; break; + case '\n': str += "\\n"; break; + case '\r': str += "\\r"; break; + case '\t': str += "\\t"; break; + case '\v': str += "\\v"; break; + default: str += c; break; } - return str; + } + return str; } inline std::string escape_characters(std::string_view sv) { - return escape_characters(sv.data(), sv.size()); + return escape_characters(sv.data(), sv.size()); } /*----------------------------------------------------------------------------- @@ -235,120 +243,121 @@ inline std::string escape_characters(std::string_view sv) { *---------------------------------------------------------------------------*/ inline bool is_hex(char c, int &v) { - if ('0' <= c && c <= '9') { - v = c - '0'; - return true; - } else if ('a' <= c && c <= 'f') { - v = c - 'a' + 10; - return true; - } else if ('A' <= c && c <= 'F') { - v = c - 'A' + 10; - return true; - } - return false; + if ('0' <= c && c <= '9') { + v = c - '0'; + return true; + } else if ('a' <= c && c <= 'f') { + v = c - 'a' + 10; + return true; + } else if ('A' <= c && c <= 'F') { + v = c - 'A' + 10; + return true; + } + return false; } inline bool is_digit(char c, int &v) { - if ('0' <= c && c <= '9') { - v = c - '0'; - return true; - } - return false; + if ('0' <= c && c <= '9') { + v = c - '0'; + return true; + } + return false; } inline std::pair parse_hex_number(const char *s, size_t n, size_t i) { - int ret = 0; - int val; - while (i < n && is_hex(s[i], val)) { - ret = static_cast(ret * 16 + val); - i++; - } - return std::pair(ret, i); + int ret = 0; + int val; + while (i < n && is_hex(s[i], val)) { + ret = static_cast(ret * 16 + val); + i++; + } + return std::pair(ret, i); } inline std::pair parse_octal_number(const char *s, size_t n, size_t i) { - int ret = 0; - int val; - while (i < n && is_digit(s[i], val)) { - ret = static_cast(ret * 8 + val); - i++; - } - return std::pair(ret, i); + int ret = 0; + int val; + while (i < n && is_digit(s[i], val)) { + ret = static_cast(ret * 8 + val); + i++; + } + return std::pair(ret, i); } inline std::string resolve_escape_sequence(const char *s, size_t n) { - std::string r; - r.reserve(n); + std::string r; + r.reserve(n); - size_t i = 0; - while (i < n) { - auto ch = s[i]; - if (ch == '\\') { - i++; - if (i == n) { throw std::runtime_error("Invalid escape sequence..."); } - switch (s[i]) { - case 'f': - r += '\f'; - i++; - break; - case 'n': - r += '\n'; - i++; - break; - case 'r': - r += '\r'; - i++; - break; - case 't': - r += '\t'; - i++; - break; - case 'v': - r += '\v'; - i++; - break; - case '\'': - r += '\''; - i++; - break; - case '"': - r += '"'; - i++; - break; - case '[': - r += '['; - i++; - break; - case ']': - r += ']'; - i++; - break; - case '\\': - r += '\\'; - i++; - break; - case 'x': - case 'u': { - char32_t cp; - std::tie(cp, i) = parse_hex_number(s, n, i + 1); - r += encode_codepoint(cp); - break; - } - default: { - char32_t cp; - std::tie(cp, i) = parse_octal_number(s, n, i); - r += encode_codepoint(cp); - break; - } - } - } else { - r += ch; - i++; - } + size_t i = 0; + while (i < n) { + auto ch = s[i]; + if (ch == '\\') { + i++; + assert(i < n); + + switch (s[i]) { + case 'f': + r += '\f'; + i++; + break; + case 'n': + r += '\n'; + i++; + break; + case 'r': + r += '\r'; + i++; + break; + case 't': + r += '\t'; + i++; + break; + case 'v': + r += '\v'; + i++; + break; + case '\'': + r += '\''; + i++; + break; + case '"': + r += '"'; + i++; + break; + case '[': + r += '['; + i++; + break; + case ']': + r += ']'; + i++; + break; + case '\\': + r += '\\'; + i++; + break; + case 'x': + case 'u': { + char32_t cp; + std::tie(cp, i) = parse_hex_number(s, n, i + 1); + r += encode_codepoint(cp); + break; + } + default: { + char32_t cp; + std::tie(cp, i) = parse_octal_number(s, n, i); + r += encode_codepoint(cp); + break; + } + } + } else { + r += ch; + i++; } - return r; + } + return r; } /*----------------------------------------------------------------------------- @@ -356,19 +365,26 @@ inline std::string resolve_escape_sequence(const char *s, size_t n) { *---------------------------------------------------------------------------*/ template T token_to_number_(std::string_view sv) { - T n = 0; + T n = 0; #if __has_include() - if constexpr (!std::is_floating_point::value) { - std::from_chars(sv.data(), sv.data() + sv.size(), n); + if constexpr (!std::is_floating_point::value) { + std::from_chars(sv.data(), sv.data() + sv.size(), n); #else - if constexpr (false) { + if constexpr (false) { #endif - } else { - auto s = std::string(sv); - std::istringstream ss(s); - ss >> n; - } - return n; + } else { + auto s = std::string(sv); + std::istringstream ss(s); + ss >> n; + } + return n; +} + +inline std::string to_lower(std::string s) { + for (auto &c : s) { + c = static_cast(std::tolower(static_cast(c))); + } + return s; } /*----------------------------------------------------------------------------- @@ -377,75 +393,75 @@ template T token_to_number_(std::string_view sv) { class Trie { public: - Trie(const std::vector &items, bool ignore_case) - : ignore_case_(ignore_case) { - size_t id = 0; - for (const auto &item : items) { - const auto &s = ignore_case ? to_lower(item) : item; - for (size_t len = 1; len <= item.size(); len++) { - auto last = len == item.size(); - std::string_view sv(s.data(), len); - auto it = dic_.find(sv); - if (it == dic_.end()) { - dic_.emplace(sv, Info{last, last, id}); - } else if (last) { - it->second.match = true; - } else { - it->second.done = false; - } - } - id++; + Trie(const std::vector &items, bool ignore_case) + : ignore_case_(ignore_case), items_count_(items.size()) { + size_t id = 0; + for (const auto &item : items) { + const auto &s = ignore_case ? to_lower(item) : item; + if (item.size() > max_len_) { max_len_ = item.size(); } + for (size_t len = 1; len <= item.size(); len++) { + auto last = len == item.size(); + std::string_view sv(s.data(), len); + auto it = dic_.find(sv); + if (it == dic_.end()) { + dic_.emplace(sv, Info{last, last, id}); + } else if (last) { + it->second.match = true; + } else { + it->second.done = false; } + } + id++; + } + } + + size_t match(const char *text, size_t text_len, size_t &id) const { + auto limit = std::min(text_len, max_len_); + std::string lower_text; + if (ignore_case_) { + lower_text = to_lower(std::string(text, limit)); + text = lower_text.data(); } - size_t match(const char *text, size_t text_len, size_t &id) const { - std::string lower_text; - if (ignore_case_) { - lower_text = to_lower(text); - text = lower_text.data(); + size_t match_len = 0; + auto done = false; + size_t len = 1; + while (!done && len <= limit) { + std::string_view sv(text, len); + auto it = dic_.find(sv); + if (it == dic_.end()) { + done = true; + } else { + if (it->second.match) { + match_len = len; + id = it->second.id; } - - size_t match_len = 0; - auto done = false; - size_t len = 1; - while (!done && len <= text_len) { - std::string_view sv(text, len); - auto it = dic_.find(sv); - if (it == dic_.end()) { - done = true; - } else { - if (it->second.match) { - match_len = len; - id = it->second.id; - } - if (it->second.done) { done = true; } - } - len += 1; - } - return match_len; + if (it->second.done) { done = true; } + } + len += 1; } + return match_len; + } - size_t size() const { return dic_.size(); } + size_t size() const { return dic_.size(); } + size_t items_count() const { return items_count_; } + + friend struct ComputeFirstSet; private: - std::string to_lower(std::string s) const { - for (char &c : s) { - c = std::tolower(c); - } - return s; - } + struct Info { + bool done; + bool match; + size_t id; + }; - struct Info { - bool done; - bool match; - size_t id; - }; + // TODO: Use unordered_map when heterogeneous lookup is supported in C++20 + // std::unordered_map dic_; + std::map> dic_; - //! \todo Use unordered_map when heterogeneous lookup is supported in C++20 - //! \todo std::unordered_map dic_; - std::map> dic_; - - bool ignore_case_; + bool ignore_case_; + size_t items_count_; + size_t max_len_ = 0; }; /*----------------------------------------------------------------------------- @@ -456,21 +472,21 @@ private: * Line information utility function */ inline std::pair line_info(const char *start, const char *cur) { - auto p = start; - auto col_ptr = p; - auto no = 1; + auto p = start; + auto col_ptr = p; + auto no = 1; - while (p < cur) { - if (*p == '\n') { - no++; - col_ptr = p + 1; - } - p++; + while (p < cur) { + if (*p == '\n') { + no++; + col_ptr = p + 1; } + p++; + } - auto col = codepoint_count(col_ptr, p - col_ptr) + 1; + auto col = codepoint_count(col_ptr, p - col_ptr) + 1; - return std::pair(no, col); + return std::pair(no, col); } /* @@ -478,19 +494,19 @@ inline std::pair line_info(const char *start, const char *cur) { */ inline constexpr unsigned int str2tag_core(const char *s, size_t l, unsigned int h) { - return (l == 0) ? h - : str2tag_core(s + 1, l - 1, - (h * 33) ^ static_cast(*s)); + return (l == 0) ? h + : str2tag_core(s + 1, l - 1, + (h * 33) ^ static_cast(*s)); } inline constexpr unsigned int str2tag(std::string_view sv) { - return str2tag_core(sv.data(), sv.size(), 0); + return str2tag_core(sv.data(), sv.size(), 0); } namespace udl { -inline constexpr unsigned int operator"" _(const char *s, size_t l) { - return str2tag_core(s, l, 0); +inline constexpr unsigned int operator""_(const char *s, size_t l) { + return str2tag_core(s, l, 0); } } // namespace udl @@ -501,126 +517,113 @@ inline constexpr unsigned int operator"" _(const char *s, size_t l) { class Context; struct SemanticValues : protected std::vector { - SemanticValues() = default; - SemanticValues(Context *c) : c_(c) {} + SemanticValues() = default; + SemanticValues(Context *c) : c_(c) {} - // Input text - const char *path = nullptr; - const char *ss = nullptr; + // Input text + const char *path = nullptr; + const char *ss = nullptr; - // Matched string - std::string_view sv() const { return sv_; } + // Matched string + std::string_view sv() const { return sv_; } - // Definition name - const std::string &name() const { return name_; } + // Definition name + const std::string &name() const { return name_; } - std::vector tags; + std::vector tags; - // Line number and column at which the matched string is - std::pair line_info() const; + // Line number and column at which the matched string is + std::pair line_info() const; - // Choice count - size_t choice_count() const { return choice_count_; } + // Choice count + size_t choice_count() const { return choice_count_; } - // Choice number (0 based index) - size_t choice() const { return choice_; } + // Choice number (0 based index) + size_t choice() const { return choice_; } - // Tokens - std::vector tokens; + // Tokens + std::vector tokens; - std::string_view token(size_t id = 0) const { - if (tokens.empty()) { return sv_; } - assert(id < tokens.size()); - return tokens[id]; + std::string_view token(size_t id = 0) const { + if (tokens.empty()) { return sv_; } + assert(id < tokens.size()); + return tokens[id]; + } + + // Token conversion + std::string token_to_string(size_t id = 0) const { + return std::string(token(id)); + } + + template T token_to_number() const { + return token_to_number_(token()); + } + + // Transform the semantic value vector to another vector + template + std::vector transform(size_t beg = 0, + size_t end = static_cast(-1)) const { + std::vector r; + end = (std::min)(end, size()); + for (size_t i = beg; i < end; i++) { + r.emplace_back(std::any_cast((*this)[i])); } + return r; + } - // Token conversion - std::string token_to_string(size_t id = 0) const { - return std::string(token(id)); - } - - template T token_to_number() const { - return token_to_number_(token()); - } - - // Transform the semantic value vector to another vector - template - std::vector transform(size_t beg = 0, - size_t end = static_cast(-1)) const { - std::vector r; - end = (std::min)(end, size()); - for (size_t i = beg; i < end; i++) { - r.emplace_back(std::any_cast((*this)[i])); - } - return r; - } - - void append(SemanticValues &chvs) { - sv_ = chvs.sv_; - for (auto &v : chvs) { - emplace_back(std::move(v)); - } - for (auto &tag : chvs.tags) { - tags.emplace_back(std::move(tag)); - } - for (auto &tok : chvs.tokens) { - tokens.emplace_back(std::move(tok)); - } - } - - using std::vector::iterator; - using std::vector::const_iterator; - using std::vector::size; - using std::vector::empty; - using std::vector::assign; - using std::vector::begin; - using std::vector::end; - using std::vector::rbegin; - using std::vector::rend; - using std::vector::operator[]; - using std::vector::at; - using std::vector::resize; - using std::vector::front; - using std::vector::back; - using std::vector::push_back; - using std::vector::pop_back; - using std::vector::insert; - using std::vector::erase; - using std::vector::clear; - using std::vector::swap; - using std::vector::emplace; - using std::vector::emplace_back; + using std::vector::iterator; + using std::vector::const_iterator; + using std::vector::size; + using std::vector::empty; + using std::vector::assign; + using std::vector::begin; + using std::vector::end; + using std::vector::rbegin; + using std::vector::rend; + using std::vector::operator[]; + using std::vector::at; + using std::vector::resize; + using std::vector::front; + using std::vector::back; + using std::vector::push_back; + using std::vector::pop_back; + using std::vector::insert; + using std::vector::erase; + using std::vector::clear; + using std::vector::swap; + using std::vector::emplace; + using std::vector::emplace_back; private: - friend class Context; - friend class Dictionary; - friend class Sequence; - friend class PrioritizedChoice; - friend class Repetition; - friend class Holder; - friend class PrecedenceClimbing; + friend class Context; + friend class Dictionary; + friend class Sequence; + friend class PrioritizedChoice; + friend class Repetition; + friend class Holder; + friend class PrecedenceClimbing; - Context *c_ = nullptr; - std::string_view sv_; - size_t choice_count_ = 0; - size_t choice_ = 0; - std::string name_; + Context *c_ = nullptr; + std::string_view sv_; + size_t choice_count_ = 0; + size_t choice_ = 0; + std::string name_; }; /* * Semantic action */ template std::any call(F fn, Args &&...args) { - using R = decltype(fn(std::forward(args)...)); - if constexpr (std::is_void::value) { - fn(std::forward(args)...); - return std::any(); - } else if constexpr (std::is_same::type, - std::any>::value) { - return fn(std::forward(args)...); - } else { - return std::any(fn(std::forward(args)...)); - } + using R = decltype(fn(std::forward(args)...)); + if constexpr (std::is_void::value) { + fn(std::forward(args)...); + return std::any(); + } else if constexpr (std::is_same::type, + std::any>::value) { + return fn(std::forward(args)...); + } else { + return std::any(fn(std::forward(args)...)); + } } template @@ -637,30 +640,74 @@ struct argument_count class Action { public: - Action() = default; - Action(Action &&rhs) = default; - template Action(F fn) : fn_(make_adaptor(fn)) {} - template void operator=(F fn) { fn_ = make_adaptor(fn); } - Action &operator=(const Action &rhs) = default; + Action() = default; + Action(Action &&rhs) = default; + template Action(F fn) : fn_(make_adaptor(fn)) {} + template void operator=(F fn) { fn_ = make_adaptor(fn); } + Action &operator=(const Action &rhs) = default; - operator bool() const { return bool(fn_); } + operator bool() const { return bool(fn_); } - std::any operator()(SemanticValues &vs, std::any &dt) const { - return fn_(vs, dt); - } + std::any operator()(SemanticValues &vs, std::any &dt, + const std::any &predicate_data) const { + return fn_(vs, dt, predicate_data); + } private: - using Fty = std::function; + using Fty = std::function; - template Fty make_adaptor(F fn) { - if constexpr (argument_count::value == 1) { - return [fn](auto &vs, auto & /*dt*/) { return call(fn, vs); }; - } else { - return [fn](auto &vs, auto &dt) { return call(fn, vs, dt); }; - } + template Fty make_adaptor(F fn) { + if constexpr (argument_count::value == 1) { + return [fn](auto &vs, auto & /*dt*/, const auto & /*predicate_data*/) { + return call(fn, vs); + }; + } else if constexpr (argument_count::value == 2) { + return [fn](auto &vs, auto &dt, const auto & /*predicate_data*/) { + return call(fn, vs, dt); + }; + } else { + return [fn](auto &vs, auto &dt, const auto &predicate_data) { + return call(fn, vs, dt, predicate_data); + }; } + } - Fty fn_; + Fty fn_; +}; + +class Predicate { +public: + Predicate() = default; + Predicate(Predicate &&rhs) = default; + template Predicate(F fn) : fn_(make_adaptor(fn)) {} + template void operator=(F fn) { fn_ = make_adaptor(fn); } + Predicate &operator=(const Predicate &rhs) = default; + + operator bool() const { return bool(fn_); } + + bool operator()(const SemanticValues &vs, const std::any &dt, + std::string &msg, std::any &predicate_data) const { + return fn_(vs, dt, msg, predicate_data); + } + +private: + using Fty = std::function; + + template Fty make_adaptor(F fn) { + if constexpr (argument_count::value == 3) { + return [fn](const auto &vs, const auto &dt, auto &msg, + auto & /*predicate_data*/) { return fn(vs, dt, msg); }; + } else { + return [fn](const auto &vs, const auto &dt, auto &msg, + auto &predicate_data) { + return fn(vs, dt, msg, predicate_data); + }; + } + } + + Fty fn_; }; /* @@ -682,67 +729,67 @@ using Log = std::function> expected_tokens; - const char *message_pos = nullptr; - std::string message; - std::string label; - const char *last_output_pos = nullptr; - bool keep_previous_token = false; + const char *error_pos = nullptr; + std::vector> expected_tokens; + const char *message_pos = nullptr; + std::string message; + std::string label; + const char *last_output_pos = nullptr; + bool keep_previous_token = false; - void clear() { - error_pos = nullptr; - expected_tokens.clear(); - message_pos = nullptr; - message.clear(); + void clear() { + error_pos = nullptr; + expected_tokens.clear(); + message_pos = nullptr; + message.clear(); + } + + void add(const char *error_literal, const Definition *error_rule) { + for (const auto &[t, r] : expected_tokens) { + if (t == error_literal && r == error_rule) { return; } } + expected_tokens.emplace_back(error_literal, error_rule); + } - void add(const char *error_literal, const Definition *error_rule) { - for (const auto &[t, r] : expected_tokens) { - if (t == error_literal && r == error_rule) { return; } - } - expected_tokens.emplace_back(error_literal, error_rule); - } - - void output_log(const Log &log, const char *s, size_t n); + void output_log(const Log &log, const char *s, size_t n); private: - int cast_char(char c) const { return static_cast(c); } + int cast_char(char c) const { return static_cast(c); } - std::string heuristic_error_token(const char *s, size_t n, - const char *pos) const { - auto len = n - std::distance(s, pos); - if (len) { - size_t i = 0; - auto c = cast_char(pos[i++]); - if (!std::ispunct(c) && !std::isspace(c)) { - while (i < len && !std::ispunct(cast_char(pos[i])) && - !std::isspace(cast_char(pos[i]))) { - i++; - } - } - - size_t count = CPPPEGLIB_HEURISTIC_ERROR_TOKEN_MAX_CHAR_COUNT; - size_t j = 0; - while (count > 0 && j < i) { - j += codepoint_length(&pos[j], i - j); - count--; - } - - return escape_characters(pos, j); + std::string heuristic_error_token(const char *s, size_t n, + const char *pos) const { + auto len = n - std::distance(s, pos); + if (len) { + size_t i = 0; + auto c = cast_char(pos[i++]); + if (!std::ispunct(c) && !std::isspace(c)) { + while (i < len && !std::ispunct(cast_char(pos[i])) && + !std::isspace(cast_char(pos[i]))) { + i++; } - return std::string(); - } + } - std::string replace_all(std::string str, const std::string &from, - const std::string &to) const { - size_t pos = 0; - while ((pos = str.find(from, pos)) != std::string::npos) { - str.replace(pos, from.length(), to); - pos += to.length(); - } - return str; + size_t count = CPPPEGLIB_HEURISTIC_ERROR_TOKEN_MAX_CHAR_COUNT; + size_t j = 0; + while (count > 0 && j < i) { + j += codepoint_length(&pos[j], i - j); + count--; + } + + return escape_characters(pos, j); } + return std::string(); + } + + std::string replace_all(std::string str, const std::string &from, + const std::string &to) const { + size_t pos = 0; + while ((pos = str.find(from, pos)) != std::string::npos) { + str.replace(pos, from.length(), to); + pos += to.length(); + } + return str; + } }; /* @@ -762,211 +809,269 @@ using TracerStartOrEnd = std::function; class Context { public: - const char *path; - const char *s; - const size_t l; + const char *path; + const char *s; + const size_t l; - ErrorInfo error_info; - bool recovered = false; + ErrorInfo error_info; + bool recovered = false; - std::vector> value_stack; - size_t value_stack_size = 0; + std::vector> value_stack; + size_t value_stack_size = 0; - std::vector rule_stack; - std::vector>> args_stack; + std::vector rule_stack; + std::vector>> args_stack; - size_t in_token_boundary_count = 0; + size_t in_token_boundary_count = 0; - std::shared_ptr whitespaceOpe; - bool in_whitespace = false; + std::shared_ptr whitespaceOpe; + bool in_whitespace = false; - std::shared_ptr wordOpe; + std::shared_ptr wordOpe; - std::vector> capture_scope_stack; - size_t capture_scope_stack_size = 0; + std::vector> capture_entries; - std::vector cut_stack; + std::vector cut_stack; - const size_t def_count; - const bool enablePackratParsing; - std::vector cache_registered; - std::vector cache_success; + const size_t def_count; + const bool enablePackratParsing; + std::vector cache_registered; + std::vector cache_success; - std::map, std::tuple> - cache_values; + std::map, std::tuple> + cache_values; - TracerEnter tracer_enter; - TracerLeave tracer_leave; - std::any trace_data; - const bool verbose_trace; + // Left recursion support + struct LRMemo { + size_t len = static_cast(-1); + std::any val; + }; + std::map, LRMemo> lr_memo; - Log log; + // Rules whose lr_memo was hit during the current parse scope. + // Used to track LR cycle membership. + std::set lr_refs_hit; - Context(const char *path, const char *s, size_t l, size_t def_count, - std::shared_ptr whitespaceOpe, std::shared_ptr wordOpe, - bool enablePackratParsing, TracerEnter tracer_enter, - TracerLeave tracer_leave, std::any trace_data, bool verbose_trace, - Log log) - : path(path), s(s), l(l), whitespaceOpe(whitespaceOpe), wordOpe(wordOpe), - def_count(def_count), enablePackratParsing(enablePackratParsing), - cache_registered(enablePackratParsing ? def_count * (l + 1) : 0), - cache_success(enablePackratParsing ? def_count * (l + 1) : 0), - tracer_enter(tracer_enter), tracer_leave(tracer_leave), - trace_data(trace_data), verbose_trace(verbose_trace), log(log) { + // Rules currently in their seeding/growing phase at a given position. + // Protected from having their lr_memo erased by inner growers. + std::set> lr_active_seeds; - push_args({}); - push_capture_scope(); + void clear_packrat_cache(const char *pos, size_t def_id) { + if (!enablePackratParsing) { return; } + auto col = static_cast(pos - s); + auto idx = def_count * col + def_id; + if (idx < cache_registered.size()) { + cache_registered[idx] = false; + cache_success[idx] = false; + } + cache_values.erase(std::make_pair(col, def_id)); + } + + void write_packrat_cache(const char *pos, size_t def_id, size_t len, + const std::any &val) { + if (!enablePackratParsing) { return; } + auto col = pos - s; + auto idx = def_count * static_cast(col) + def_id; + if (idx >= cache_registered.size()) { return; } + cache_registered[idx] = true; + cache_success[idx] = true; + auto key = std::pair(col, def_id); + cache_values[key] = std::pair(len, val); + } + + TracerEnter tracer_enter; + TracerLeave tracer_leave; + std::any trace_data; + const bool verbose_trace; + + Log log; + + Context(const char *path, const char *s, size_t l, size_t def_count, + std::shared_ptr whitespaceOpe, std::shared_ptr wordOpe, + bool enablePackratParsing, TracerEnter tracer_enter, + TracerLeave tracer_leave, std::any trace_data, bool verbose_trace, + Log log) + : path(path), s(s), l(l), whitespaceOpe(whitespaceOpe), wordOpe(wordOpe), + def_count(def_count), enablePackratParsing(enablePackratParsing), + cache_registered(enablePackratParsing ? def_count * (l + 1) : 0), + cache_success(enablePackratParsing ? def_count * (l + 1) : 0), + tracer_enter(tracer_enter), tracer_leave(tracer_leave), + trace_data(trace_data), verbose_trace(verbose_trace), log(log) { + + push_args({}); + } + + ~Context() { + assert(!value_stack_size); + assert(cut_stack.empty()); + } + + Context(const Context &) = delete; + Context(Context &&) = delete; + Context operator=(const Context &) = delete; + + // Per-rule packrat stats (populated when packrat_stats is non-null) + struct PackratStats { + size_t hits = 0; + size_t misses = 0; + }; + std::vector *packrat_stats = nullptr; + + // Per-rule packrat filter: if set, only rules with filter[def_id]=true + // use full memoization (cache_values map). Others use bitvector-only + // re-entry guard. + const std::vector *packrat_rule_filter = nullptr; + + template + void packrat(const char *a_s, size_t def_id, size_t &len, std::any &val, + T fn) { + if (!enablePackratParsing) { + fn(val); + return; } - ~Context() { - pop_capture_scope(); + auto col = a_s - s; + auto idx = def_count * static_cast(col) + def_id; - assert(!value_stack_size); - assert(!capture_scope_stack_size); - assert(cut_stack.empty()); + if (cache_registered[idx]) { + if (packrat_stats && def_id < packrat_stats->size()) { + (*packrat_stats)[def_id].hits++; + } + if (cache_success[idx]) { + auto key = std::pair(col, def_id); + std::tie(len, val) = cache_values[key]; + return; + } else { + len = static_cast(-1); + return; + } + } else { + // Pre-register as failure (re-entry guard for all rules) + cache_registered[idx] = true; + cache_success[idx] = false; + + if (packrat_stats && def_id < packrat_stats->size()) { + (*packrat_stats)[def_id].misses++; + } + + fn(val); + + bool full_memo = + !packrat_rule_filter || (def_id < packrat_rule_filter->size() && + (*packrat_rule_filter)[def_id]); + if (full_memo) { + if (success(len)) { write_packrat_cache(a_s, def_id, len, val); } + } else { + // Guard-only: undo registration so future calls re-parse + cache_registered[idx] = false; + } + return; + } + } + + // Semantic values + SemanticValues &push_semantic_values_scope() { + assert(value_stack_size <= value_stack.size()); + if (value_stack_size == value_stack.size()) { + value_stack.emplace_back(std::make_shared(this)); + } else { + auto &vs = *value_stack[value_stack_size]; + if (!vs.empty()) { + vs.clear(); + if (!vs.tags.empty()) { vs.tags.clear(); } + } + vs.sv_ = std::string_view(); + vs.choice_count_ = 0; + vs.choice_ = 0; + if (!vs.tokens.empty()) { vs.tokens.clear(); } } - Context(const Context &) = delete; - Context(Context &&) = delete; - Context operator=(const Context &) = delete; + auto &vs = *value_stack[value_stack_size++]; + vs.path = path; + vs.ss = s; + return vs; + } - template - void packrat(const char *a_s, size_t def_id, size_t &len, std::any &val, - T fn) { - if (!enablePackratParsing) { - fn(val); - return; - } + void pop_semantic_values_scope() { value_stack_size--; } - auto col = a_s - s; - auto idx = def_count * static_cast(col) + def_id; + // Arguments + void push_args(std::vector> &&args) { + args_stack.emplace_back(std::move(args)); + } - if (cache_registered[idx]) { - if (cache_success[idx]) { - auto key = std::pair(col, def_id); - std::tie(len, val) = cache_values[key]; - return; - } else { - len = static_cast(-1); - return; - } - } else { - fn(val); - cache_registered[idx] = true; - cache_success[idx] = success(len); - if (success(len)) { - auto key = std::pair(col, def_id); - cache_values[key] = std::pair(len, val); - } - return; - } - } + void pop_args() { args_stack.pop_back(); } - SemanticValues &push() { - push_capture_scope(); - return push_semantic_values_scope(); - } + const std::vector> &top_args() const { + return args_stack[args_stack.size() - 1]; + } - void pop() { - pop_capture_scope(); - pop_semantic_values_scope(); - } + // Snapshot/Rollback + struct Snapshot { + size_t sv_size; + size_t sv_tags_size; + size_t sv_tokens_size; + std::string_view sv_sv; + size_t choice_count; + size_t choice; + size_t capture_size; + }; - // Semantic values - SemanticValues &push_semantic_values_scope() { - assert(value_stack_size <= value_stack.size()); - if (value_stack_size == value_stack.size()) { - value_stack.emplace_back(std::make_shared(this)); - } else { - auto &vs = *value_stack[value_stack_size]; - if (!vs.empty()) { - vs.clear(); - if (!vs.tags.empty()) { vs.tags.clear(); } - } - vs.sv_ = std::string_view(); - vs.choice_count_ = 0; - vs.choice_ = 0; - if (!vs.tokens.empty()) { vs.tokens.clear(); } - } + Snapshot snapshot(const SemanticValues &vs) const { + return {vs.size(), vs.tags.size(), vs.tokens.size(), vs.sv_, + vs.choice_count_, vs.choice_, capture_entries.size()}; + } - auto &vs = *value_stack[value_stack_size++]; - vs.path = path; - vs.ss = s; - return vs; - } + void rollback(SemanticValues &vs, const Snapshot &snap) { + vs.resize(snap.sv_size); + vs.tags.resize(snap.sv_tags_size); + vs.tokens.resize(snap.sv_tokens_size); + vs.sv_ = snap.sv_sv; + vs.choice_count_ = snap.choice_count; + vs.choice_ = snap.choice; + capture_entries.resize(snap.capture_size); + } - void pop_semantic_values_scope() { value_stack_size--; } + // Skip trailing whitespace with trace suppression. + // Returns whitespace length, or -1 on failure. + // No-op (returns 0) if inside a token boundary or no whitespaceOpe. + size_t skip_whitespace(const char *a_s, size_t n, SemanticValues &vs, + std::any &dt); - // Arguments - void push_args(std::vector> &&args) { - args_stack.emplace_back(args); - } + // Error + void set_error_pos(const char *a_s, const char *literal = nullptr); - void pop_args() { args_stack.pop_back(); } + // Trace + void trace_enter(const Ope &ope, const char *a_s, size_t n, + const SemanticValues &vs, std::any &dt); + void trace_leave(const Ope &ope, const char *a_s, size_t n, + const SemanticValues &vs, std::any &dt, size_t len); + bool is_traceable(const Ope &ope) const; - const std::vector> &top_args() const { - return args_stack[args_stack.size() - 1]; - } + // Line info + std::pair line_info(const char *cur) const { + std::call_once(source_line_index_init_, [this]() { + for (size_t pos = 0; pos < l; pos++) { + if (s[pos] == '\n') { source_line_index.push_back(pos); } + } + source_line_index.push_back(l); + }); - // Capture scope - void push_capture_scope() { - assert(capture_scope_stack_size <= capture_scope_stack.size()); - if (capture_scope_stack_size == capture_scope_stack.size()) { - capture_scope_stack.emplace_back( - std::map()); - } else { - auto &cs = capture_scope_stack[capture_scope_stack_size]; - if (!cs.empty()) { cs.clear(); } - } - capture_scope_stack_size++; - } + auto pos = static_cast(std::distance(s, cur)); - void pop_capture_scope() { capture_scope_stack_size--; } + auto it = std::lower_bound( + source_line_index.begin(), source_line_index.end(), pos, + [](size_t element, size_t value) { return element < value; }); - void shift_capture_values() { - assert(capture_scope_stack_size >= 2); - auto curr = &capture_scope_stack[capture_scope_stack_size - 1]; - auto prev = curr - 1; - for (const auto &[k, v] : *curr) { - (*prev)[k] = v; - } - } + auto id = static_cast(std::distance(source_line_index.begin(), it)); + auto off = pos - (id == 0 ? 0 : source_line_index[id - 1] + 1); + return std::pair(id + 1, off + 1); + } - // Error - void set_error_pos(const char *a_s, const char *literal = nullptr); - - // Trace - void trace_enter(const Ope &ope, const char *a_s, size_t n, - const SemanticValues &vs, std::any &dt); - void trace_leave(const Ope &ope, const char *a_s, size_t n, - const SemanticValues &vs, std::any &dt, size_t len); - bool is_traceable(const Ope &ope) const; - - // Line info - std::pair line_info(const char *cur) const { - std::call_once(source_line_index_init_, [this]() { - for (size_t pos = 0; pos < l; pos++) { - if (s[pos] == '\n') { source_line_index.push_back(pos); } - } - source_line_index.push_back(l); - }); - - auto pos = static_cast(std::distance(s, cur)); - - auto it = std::lower_bound( - source_line_index.begin(), source_line_index.end(), pos, - [](size_t element, size_t value) { return element < value; }); - - auto id = static_cast(std::distance(source_line_index.begin(), it)); - auto off = pos - (id == 0 ? 0 : source_line_index[id - 1] + 1); - return std::pair(id + 1, off + 1); - } - - size_t next_trace_id = 0; - std::vector trace_ids; - bool ignore_trace_state = false; - mutable std::once_flag source_line_index_init_; - mutable std::vector source_line_index; + size_t next_trace_id = 0; + std::vector trace_ids; + bool ignore_trace_state = false; + mutable std::once_flag source_line_index_init_; + mutable std::vector source_line_index; }; /* @@ -974,416 +1079,590 @@ public: */ class Ope { public: - struct Visitor; + struct Visitor; - virtual ~Ope() = default; - size_t parse(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const; - virtual size_t parse_core(const char *s, size_t n, SemanticValues &vs, - Context &c, std::any &dt) const = 0; - virtual void accept(Visitor &v) = 0; + virtual ~Ope() = default; + size_t parse(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const; + virtual size_t parse_core(const char *s, size_t n, SemanticValues &vs, + Context &c, std::any &dt) const = 0; + virtual void accept(Visitor &v) = 0; + + bool is_token_boundary = false; + bool is_choice_like = false; +}; + +// Keyword-guarded identifier data, heap-allocated only for matching Sequences. +// Avoids bloating all Sequence objects with bitsets and keyword sets. +struct KeywordGuardData { + std::bitset<256> identifier_first; // first char of identifier + std::bitset<256> identifier_rest; // subsequent chars of identifier + std::vector exact_keywords; // single-word keywords (lowercase) + std::vector prefix_keywords; // first word of compound keywords + size_t min_keyword_len = 0; + size_t max_keyword_len = 0; + + static bool matches_any(const std::vector &keywords, + std::string_view input) { + return std::any_of(keywords.begin(), keywords.end(), + [&](const auto &kw) { return kw == input; }); + } }; class Sequence : public Ope { public: - template - Sequence(const Args &...args) - : opes_{static_cast>(args)...} {} - Sequence(const std::vector> &opes) : opes_(opes) {} - Sequence(std::vector> &&opes) : opes_(opes) {} + template + Sequence(const Args &...args) + : opes_{static_cast>(args)...} {} + Sequence(const std::vector> &opes) : opes_(opes) {} + Sequence(std::vector> &&opes) : opes_(std::move(opes)) {} - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override { - auto &chvs = c.push_semantic_values_scope(); - auto se = scope_exit([&]() { c.pop_semantic_values_scope(); }); - size_t i = 0; - for (const auto &ope : opes_) { - auto len = ope->parse(s + i, n - i, chvs, c, dt); - if (fail(len)) { return len; } - i += len; - } - vs.append(chvs); - return i; + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override { + // Keyword-guarded identifier fast path: + // Fuses !ReservedKeyword into scan-then-lookup + if (kw_guard_) { + if (auto result = parse_keyword_guarded(s, n, vs, c, dt)) { + return *result; + } + // nullopt means prefix keyword match — fall through to normal path } + size_t i = 0; + for (const auto &ope : opes_) { + auto len = ope->parse(s + i, n - i, vs, c, dt); + if (fail(len)) { return len; } + i += len; + } + return i; + } - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::vector> opes_; + std::vector> opes_; + +private: + friend struct SetupFirstSets; + std::unique_ptr kw_guard_; + + // Returns parse result, or nullopt to fall through to normal path + std::optional parse_keyword_guarded(const char *s, size_t n, + SemanticValues &vs, Context &c, + std::any &dt) const { + const auto &kw = *kw_guard_; + if (n < 1 || !kw.identifier_first.test(static_cast(*s))) { + c.set_error_pos(s); + return static_cast(-1); + } + // Scan identifier using bitset + size_t id_len = 1; + while (id_len < n && + kw.identifier_rest.test(static_cast(s[id_len]))) { + id_len++; + } + // Skip keyword matching if identifier length is out of range + if (id_len >= kw.min_keyword_len && id_len <= kw.max_keyword_len) { + char lower_buf[64]; + std::unique_ptr lower_heap; + char *lower = lower_buf; + if (id_len > sizeof(lower_buf)) { + lower_heap.reset(new char[id_len]); + lower = lower_heap.get(); + } + std::transform(s, s + id_len, lower, [](unsigned char ch) { + return static_cast(std::tolower(ch)); + }); + std::string_view lower_sv(lower, id_len); + + if (KeywordGuardData::matches_any(kw.exact_keywords, lower_sv)) { + c.set_error_pos(s); + return static_cast(-1); + } + if (KeywordGuardData::matches_any(kw.prefix_keywords, lower_sv)) { + return std::nullopt; + } + } + // Success: emit token and consume trailing whitespace + vs.tokens.emplace_back(std::string_view(s, id_len)); + auto wl = c.skip_whitespace(s + id_len, n - id_len, vs, dt); + if (fail(wl)) { return wl; } + return id_len + wl; + } +}; + +struct FirstSet { + // First-Set: set of possible first bytes for an expression. + // Used by PrioritizedChoice to skip alternatives that cannot match. + std::bitset<256> chars; // byte values that can appear as the first byte + bool can_be_empty = false; // true if the expression can match empty string + bool any_char = false; // true if any character can appear (cannot filter) + const char *first_literal = nullptr; // first literal for error reporting + const Definition *first_rule = + nullptr; // first token rule for error reporting + + void merge(const FirstSet &other) { + chars |= other.chars; + if (other.can_be_empty) { can_be_empty = true; } + if (other.any_char) { any_char = true; } + // Note: first_literal/first_rule are NOT merged — per-alternative + } }; class PrioritizedChoice : public Ope { public: - template - PrioritizedChoice(bool for_label, const Args &...args) - : opes_{static_cast>(args)...}, - for_label_(for_label) {} - PrioritizedChoice(const std::vector> &opes) - : opes_(opes) {} - PrioritizedChoice(std::vector> &&opes) : opes_(opes) {} + template + PrioritizedChoice(bool for_label, const Args &...args) + : opes_{static_cast>(args)...}, + for_label_(for_label) { + is_choice_like = true; + } + PrioritizedChoice(const std::vector> &opes) + : opes_(opes) { + is_choice_like = true; + } + PrioritizedChoice(std::vector> &&opes) + : opes_(std::move(opes)) { + is_choice_like = true; + } - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override { - size_t len = static_cast(-1); + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override { + size_t len = static_cast(-1); - if (!for_label_) { c.cut_stack.push_back(false); } - auto se1 = scope_exit([&]() { - if (!for_label_) { c.cut_stack.pop_back(); } - }); + if (!for_label_) { c.cut_stack.push_back(false); } + auto se = scope_exit([&]() { + if (!for_label_) { c.cut_stack.pop_back(); } + }); - size_t id = 0; - for (const auto &ope : opes_) { - if (!c.cut_stack.empty()) { c.cut_stack.back() = false; } - - auto &chvs = c.push(); - c.error_info.keep_previous_token = id > 0; - auto se2 = scope_exit([&]() { - c.pop(); - c.error_info.keep_previous_token = false; - }); - - len = ope->parse(s, n, chvs, c, dt); - - if (success(len)) { - vs.append(chvs); - vs.choice_count_ = opes_.size(); - vs.choice_ = id; - c.shift_capture_values(); - break; - } else if (!c.cut_stack.empty() && c.cut_stack.back()) { - break; + size_t id = 0; + for (const auto &ope : opes_) { + // First-Set filtering: skip if next byte cannot start this alternative + if (n > 0 && id < first_sets_.size()) { + const auto &fs = first_sets_[id]; + if (!fs.any_char && !fs.can_be_empty && + !fs.chars.test(static_cast(*s))) { + if (c.log && (fs.first_literal || fs.first_rule)) { + if (c.error_info.error_pos <= s) { + if (c.error_info.error_pos < s || !(id > 0)) { + c.error_info.error_pos = s; + c.error_info.expected_tokens.clear(); + } + if (fs.first_literal) { + c.error_info.add(fs.first_literal, nullptr); + } else { + c.error_info.add(nullptr, fs.first_rule); + } } - - id++; + } + id++; + continue; } + } - return len; + if (!c.cut_stack.empty()) { c.cut_stack.back() = false; } + + auto snap = c.snapshot(vs); + c.error_info.keep_previous_token = id > 0; + + len = ope->parse(s, n, vs, c, dt); + + if (success(len)) { + vs.choice_count_ = opes_.size(); + vs.choice_ = id; + break; + } + + c.rollback(vs, snap); + + if (!c.cut_stack.empty() && c.cut_stack.back()) { break; } + + id++; } - void accept(Visitor &v) override; + c.error_info.keep_previous_token = false; + return len; + } - size_t size() const { return opes_.size(); } + void accept(Visitor &v) override; - std::vector> opes_; - bool for_label_ = false; + size_t size() const { return opes_.size(); } + + std::vector> opes_; + bool for_label_ = false; + std::vector first_sets_; }; class Repetition : public Ope { public: - Repetition(const std::shared_ptr &ope, size_t min, size_t max) - : ope_(ope), min_(min), max_(max) {} + Repetition(const std::shared_ptr &ope, size_t min, size_t max) + : ope_(ope), min_(min), max_(max) {} - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override { - size_t count = 0; - size_t i = 0; - while (count < min_) { - auto &chvs = c.push(); - auto se = scope_exit([&]() { c.pop(); }); - - auto len = ope_->parse(s + i, n - i, chvs, c, dt); - - if (success(len)) { - vs.append(chvs); - c.shift_capture_values(); - } else { - return len; - } - i += len; - count++; + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override { + // ISpan fast path: tight loop for ASCII CharacterClass repetition. + // Safe because each ASCII match is exactly 1 byte, so byte count == match + // count. + if (span_bitset_) { + const auto &bitset = *span_bitset_; + size_t i = 0; + if (max_ == std::numeric_limits::max()) { + // Unbounded repetition (*, +): no per-iteration max check + while (i < n && bitset.test(static_cast(s[i]))) { + i++; } - - while (count < max_) { - auto &chvs = c.push(); - auto se = scope_exit([&]() { c.pop(); }); - - auto len = ope_->parse(s + i, n - i, chvs, c, dt); - - if (success(len)) { - vs.append(chvs); - c.shift_capture_values(); - } else { - break; - } - i += len; - count++; + } else { + auto limit = std::min(n, max_); + while (i < limit && bitset.test(static_cast(s[i]))) { + i++; } - return i; + } + if (i < min_) { + c.set_error_pos(s + i); + return static_cast(-1); + } + return i; } - void accept(Visitor &v) override; - - bool is_zom() const { - return min_ == 0 && max_ == std::numeric_limits::max(); + size_t count = 0; + size_t i = 0; + while (count < min_) { + auto len = ope_->parse(s + i, n - i, vs, c, dt); + if (fail(len)) { return len; } + i += len; + count++; } - static std::shared_ptr zom(const std::shared_ptr &ope) { - return std::make_shared(ope, 0, - std::numeric_limits::max()); + while (count < max_) { + auto snap = c.snapshot(vs); + auto len = ope_->parse(s + i, n - i, vs, c, dt); + if (fail(len)) { + c.rollback(vs, snap); + break; + } + i += len; + count++; } + return i; + } - static std::shared_ptr oom(const std::shared_ptr &ope) { - return std::make_shared(ope, 1, - std::numeric_limits::max()); - } + void accept(Visitor &v) override; - static std::shared_ptr opt(const std::shared_ptr &ope) { - return std::make_shared(ope, 0, 1); - } + bool is_zom() const { + return min_ == 0 && max_ == std::numeric_limits::max(); + } - std::shared_ptr ope_; - size_t min_; - size_t max_; + static std::shared_ptr zom(const std::shared_ptr &ope) { + return std::make_shared(ope, 0, + std::numeric_limits::max()); + } + + static std::shared_ptr oom(const std::shared_ptr &ope) { + return std::make_shared(ope, 1, + std::numeric_limits::max()); + } + + static std::shared_ptr opt(const std::shared_ptr &ope) { + return std::make_shared(ope, 0, 1); + } + + std::shared_ptr ope_; + size_t min_; + size_t max_; + const std::bitset<256> *span_bitset_ = + nullptr; // non-owning, set by SetupFirstSets }; class AndPredicate : public Ope { public: - AndPredicate(const std::shared_ptr &ope) : ope_(ope) {} + AndPredicate(const std::shared_ptr &ope) : ope_(ope) {} - size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, - Context &c, std::any &dt) const override { - auto &chvs = c.push(); - auto se = scope_exit([&]() { c.pop(); }); - - auto len = ope_->parse(s, n, chvs, c, dt); - - if (success(len)) { - return 0; - } else { - return len; - } + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override { + auto snap = c.snapshot(vs); + auto len = ope_->parse(s, n, vs, c, dt); + c.rollback(vs, snap); // Always rollback — predicates consume nothing + if (success(len)) { + return 0; + } else { + return len; } + } - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::shared_ptr ope_; + std::shared_ptr ope_; }; class NotPredicate : public Ope { public: - NotPredicate(const std::shared_ptr &ope) : ope_(ope) {} + NotPredicate(const std::shared_ptr &ope) : ope_(ope) {} - size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, - Context &c, std::any &dt) const override { - auto &chvs = c.push(); - auto se = scope_exit([&]() { c.pop(); }); - auto len = ope_->parse(s, n, chvs, c, dt); - if (success(len)) { - c.set_error_pos(s); - return static_cast(-1); - } else { - return 0; - } + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override { + auto snap = c.snapshot(vs); + auto len = ope_->parse(s, n, vs, c, dt); + c.rollback(vs, snap); // Always rollback — predicates consume nothing + if (success(len)) { + c.set_error_pos(s); + return static_cast(-1); + } else { + return 0; } + } - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::shared_ptr ope_; + std::shared_ptr ope_; }; class Dictionary : public Ope, public std::enable_shared_from_this { public: - Dictionary(const std::vector &v, bool ignore_case) - : trie_(v, ignore_case) {} + Dictionary(const std::vector &v, bool ignore_case) + : trie_(v, ignore_case) { + is_choice_like = true; + } - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override; + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override; - void accept(Visitor &v) override; + void accept(Visitor &v) override; - Trie trie_; + Trie trie_; }; class LiteralString : public Ope, public std::enable_shared_from_this { public: - LiteralString(std::string &&s, bool ignore_case) - : lit_(s), ignore_case_(ignore_case), is_word_(false) {} + LiteralString(std::string &&s, bool ignore_case) + : lit_(std::move(s)), ignore_case_(ignore_case), + lower_lit_(ignore_case ? to_lower(lit_) : std::string()), + is_word_(false) {} - LiteralString(const std::string &s, bool ignore_case) - : lit_(s), ignore_case_(ignore_case), is_word_(false) {} + LiteralString(const std::string &s, bool ignore_case) + : lit_(s), ignore_case_(ignore_case), + lower_lit_(ignore_case ? to_lower(lit_) : std::string()), + is_word_(false) {} - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override; + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override; - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::string lit_; - bool ignore_case_; - mutable std::once_flag init_is_word_; - mutable bool is_word_; + std::string lit_; + bool ignore_case_; + std::string lower_lit_; // pre-computed for ignore_case + mutable std::once_flag init_is_word_; + mutable bool is_word_; }; class CharacterClass : public Ope, public std::enable_shared_from_this { public: - CharacterClass(const std::string &s, bool negated, bool ignore_case) - : negated_(negated), ignore_case_(ignore_case) { - auto chars = decode(s.data(), s.length()); - auto i = 0u; - while (i < chars.size()) { - if (i + 2 < chars.size() && chars[i + 1] == '-') { - auto cp1 = chars[i]; - auto cp2 = chars[i + 2]; - ranges_.emplace_back(std::pair(cp1, cp2)); - i += 3; - } else { - auto cp = chars[i]; - ranges_.emplace_back(std::pair(cp, cp)); - i += 1; - } - } - assert(!ranges_.empty()); + CharacterClass(const std::string &s, bool negated, bool ignore_case) + : negated_(negated), ignore_case_(ignore_case) { + auto chars = decode(s.data(), s.length()); + auto i = 0u; + while (i < chars.size()) { + if (i + 2 < chars.size() && chars[i + 1] == '-') { + auto cp1 = chars[i]; + auto cp2 = chars[i + 2]; + ranges_.emplace_back(std::pair(cp1, cp2)); + i += 3; + } else { + auto cp = chars[i]; + ranges_.emplace_back(std::pair(cp, cp)); + i += 1; + } + } + assert(!ranges_.empty()); + setup_ascii_bitset(); + } + + CharacterClass(const std::vector> &ranges, + bool negated, bool ignore_case) + : ranges_(ranges), negated_(negated), ignore_case_(ignore_case) { + assert(!ranges_.empty()); + setup_ascii_bitset(); + } + + size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, + Context &c, std::any & /*dt*/) const override { + if (n < 1) { + c.set_error_pos(s); + return static_cast(-1); } - CharacterClass(const std::vector> &ranges, - bool negated, bool ignore_case) - : ranges_(ranges), negated_(negated), ignore_case_(ignore_case) { - assert(!ranges_.empty()); - } - - size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, - Context &c, std::any & /*dt*/) const override { - if (n < 1) { - c.set_error_pos(s); - return static_cast(-1); - } - - char32_t cp = 0; - auto len = decode_codepoint(s, n, cp); - - for (const auto &range : ranges_) { - if (in_range(range, cp)) { - if (negated_) { - c.set_error_pos(s); - return static_cast(-1); - } else { - return len; - } - } - } + char32_t cp = 0; + auto len = decode_codepoint(s, n, cp); + for (const auto &range : ranges_) { + if (in_range(range, cp)) { if (negated_) { - return len; + c.set_error_pos(s); + return static_cast(-1); } else { - c.set_error_pos(s); - return static_cast(-1); + return len; } + } } - void accept(Visitor &v) override; + if (negated_) { + return len; + } else { + c.set_error_pos(s); + return static_cast(-1); + } + } + + void accept(Visitor &v) override; + + friend struct ComputeFirstSet; + + bool is_ascii_only() const { return is_ascii_only_; } + const std::bitset<256> &ascii_bitset() const { return ascii_bitset_; } private: - bool in_range(const std::pair &range, char32_t cp) const { - if (ignore_case_) { - auto cpl = std::tolower(cp); - return std::tolower(range.first) <= cpl && - cpl <= std::tolower(range.second); - } else { - return range.first <= cp && cp <= range.second; - } + bool in_range(const std::pair &range, char32_t cp) const { + if (ignore_case_) { + auto cpl = std::tolower(cp); + return std::tolower(range.first) <= cpl && + cpl <= std::tolower(range.second); + } else { + return range.first <= cp && cp <= range.second; } + } - std::vector> ranges_; - bool negated_; - bool ignore_case_; + void setup_ascii_bitset() { + if (negated_) { return; } // negated classes can match non-ASCII + for (const auto &[lo, hi] : ranges_) { + if (lo > 0x7F || hi > 0x7F) { return; } + } + is_ascii_only_ = true; + for (const auto &[lo, hi] : ranges_) { + for (auto cp = lo; cp <= hi; cp++) { + auto ch = static_cast(cp); + ascii_bitset_.set(ch); + if (ignore_case_) { + ascii_bitset_.set(static_cast(std::toupper(ch))); + ascii_bitset_.set(static_cast(std::tolower(ch))); + } + } + } + } + + std::vector> ranges_; + bool negated_; + bool ignore_case_; + std::bitset<256> ascii_bitset_; + bool is_ascii_only_ = false; }; class Character : public Ope, public std::enable_shared_from_this { public: - Character(char ch) : ch_(ch) {} + Character(char32_t ch) : ch_(ch) {} - size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, - Context &c, std::any & /*dt*/) const override { - if (n < 1 || s[0] != ch_) { - c.set_error_pos(s); - return static_cast(-1); - } - return 1; + size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, + Context &c, std::any & /*dt*/) const override { + if (n < 1) { + c.set_error_pos(s); + return static_cast(-1); } - void accept(Visitor &v) override; + char32_t cp = 0; + auto len = decode_codepoint(s, n, cp); - char ch_; + if (cp != ch_) { + c.set_error_pos(s); + return static_cast(-1); + } + return len; + } + + void accept(Visitor &v) override; + + char32_t ch_; }; class AnyCharacter : public Ope, public std::enable_shared_from_this { public: - size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, - Context &c, std::any & /*dt*/) const override { - auto len = codepoint_length(s, n); - if (len < 1) { - c.set_error_pos(s); - return static_cast(-1); - } - return len; + size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, + Context &c, std::any & /*dt*/) const override { + auto len = codepoint_length(s, n); + if (len < 1) { + c.set_error_pos(s); + return static_cast(-1); } + return len; + } - void accept(Visitor &v) override; + void accept(Visitor &v) override; }; class CaptureScope : public Ope { public: - CaptureScope(const std::shared_ptr &ope) : ope_(ope) {} + CaptureScope(const std::shared_ptr &ope) : ope_(ope) {} - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override { - c.push_capture_scope(); - auto se = scope_exit([&]() { c.pop_capture_scope(); }); - return ope_->parse(s, n, vs, c, dt); - } + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override { + auto cap_snap = c.capture_entries.size(); + auto len = ope_->parse(s, n, vs, c, dt); + c.capture_entries.resize(cap_snap); // Always rollback (isolation) + return len; + } - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::shared_ptr ope_; + std::shared_ptr ope_; }; class Capture : public Ope { public: - using MatchAction = std::function; + using MatchAction = std::function; - Capture(const std::shared_ptr &ope, MatchAction ma) - : ope_(ope), match_action_(ma) {} + Capture(const std::shared_ptr &ope, MatchAction ma) + : ope_(ope), match_action_(ma) {} - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override { - auto len = ope_->parse(s, n, vs, c, dt); - if (success(len) && match_action_) { match_action_(s, len, c); } - return len; - } + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override { + auto len = ope_->parse(s, n, vs, c, dt); + if (success(len) && match_action_) { match_action_(s, len, c); } + return len; + } - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::shared_ptr ope_; - MatchAction match_action_; + std::shared_ptr ope_; + MatchAction match_action_; }; class TokenBoundary : public Ope { public: - TokenBoundary(const std::shared_ptr &ope) : ope_(ope) {} + TokenBoundary(const std::shared_ptr &ope) : ope_(ope) { + is_token_boundary = true; + } - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override; + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override; - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::shared_ptr ope_; + std::shared_ptr ope_; }; class Ignore : public Ope { public: - Ignore(const std::shared_ptr &ope) : ope_(ope) {} + Ignore(const std::shared_ptr &ope) : ope_(ope) {} - size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, - Context &c, std::any &dt) const override { - auto &chvs = c.push_semantic_values_scope(); - auto se = scope_exit([&]() { c.pop_semantic_values_scope(); }); - return ope_->parse(s, n, chvs, c, dt); - } + size_t parse_core(const char *s, size_t n, SemanticValues & /*vs*/, + Context &c, std::any &dt) const override { + auto &chvs = c.push_semantic_values_scope(); + auto se = scope_exit([&]() { c.pop_semantic_values_scope(); }); + return ope_->parse(s, n, chvs, c, dt); + } - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::shared_ptr ope_; + std::shared_ptr ope_; }; using Parser = std::function - fn_; + User(Parser fn) : fn_(fn) {} + size_t parse_core(const char *s, size_t n, SemanticValues &vs, + Context & /*c*/, std::any &dt) const override { + assert(fn_); + return fn_(s, n, vs, dt); + } + void accept(Visitor &v) override; + std::function + fn_; }; class WeakHolder : public Ope { public: - WeakHolder(const std::shared_ptr &ope) : weak_(ope) {} + WeakHolder(const std::shared_ptr &ope) : weak_(ope) {} - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override { - auto ope = weak_.lock(); - assert(ope); - return ope->parse(s, n, vs, c, dt); - } + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override { + auto ope = weak_.lock(); + assert(ope); + return ope->parse(s, n, vs, c, dt); + } - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::weak_ptr weak_; + std::weak_ptr weak_; }; class Holder : public Ope { public: - Holder(Definition *outer) : outer_(outer) {} + Holder(Definition *outer) : outer_(outer) {} - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override; + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override; - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::any reduce(SemanticValues &vs, std::any &dt) const; + std::any reduce(SemanticValues &vs, std::any &dt, + const std::any &predicate_data) const; - const std::string &name() const; - const std::string &trace_name() const; + const std::string &name() const; + const std::string &trace_name() const; - std::shared_ptr ope_; - Definition *outer_; - mutable std::once_flag trace_name_init_; - mutable std::string trace_name_; + std::shared_ptr ope_; + Definition *outer_; + mutable std::once_flag trace_name_init_; + mutable std::string trace_name_; - friend class Definition; + friend class Definition; }; using Grammar = std::unordered_map; class Reference : public Ope, public std::enable_shared_from_this { public: - Reference(const Grammar &grammar, const std::string &name, const char *s, - bool is_macro, const std::vector> &args) - : grammar_(grammar), name_(name), s_(s), is_macro_(is_macro), args_(args), - rule_(nullptr), iarg_(0) {} + Reference(const Grammar &grammar, const std::string &name, const char *s, + bool is_macro, const std::vector> &args) + : grammar_(grammar), name_(name), s_(s), is_macro_(is_macro), args_(args), + rule_(nullptr), iarg_(0) {} - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override; + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override; - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::shared_ptr get_core_operator() const; + std::shared_ptr get_core_operator() const; - const Grammar &grammar_; - const std::string name_; - const char *s_; + const Grammar &grammar_; + const std::string name_; + const char *s_; - const bool is_macro_; - const std::vector> args_; + const bool is_macro_; + const std::vector> args_; - Definition *rule_; - size_t iarg_; + Definition *rule_; + size_t iarg_; }; class Whitespace : public Ope { public: - Whitespace(const std::shared_ptr &ope) : ope_(ope) {} + Whitespace(const std::shared_ptr &ope) : ope_(ope) {} - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override { - if (c.in_whitespace) { return 0; } - c.in_whitespace = true; - auto se = scope_exit([&]() { c.in_whitespace = false; }); - return ope_->parse(s, n, vs, c, dt); - } + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override { + if (c.in_whitespace) { return 0; } + c.in_whitespace = true; + auto se = scope_exit([&]() { c.in_whitespace = false; }); + return ope_->parse(s, n, vs, c, dt); + } - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::shared_ptr ope_; + std::shared_ptr ope_; }; class BackReference : public Ope { public: - BackReference(std::string &&name) : name_(name) {} + BackReference(std::string &&name) : name_(std::move(name)) {} - BackReference(const std::string &name) : name_(name) {} + BackReference(const std::string &name) : name_(name) {} - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override; + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override; - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::string name_; + std::string name_; }; class PrecedenceClimbing : public Ope { public: - using BinOpeInfo = std::map>; + using BinOpeInfo = std::map>; - PrecedenceClimbing(const std::shared_ptr &atom, - const std::shared_ptr &binop, const BinOpeInfo &info, - const Definition &rule) - : atom_(atom), binop_(binop), info_(info), rule_(rule) {} + PrecedenceClimbing(const std::shared_ptr &atom, + const std::shared_ptr &binop, const BinOpeInfo &info, + const Definition &rule) + : atom_(atom), binop_(binop), info_(info), rule_(rule) {} - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override { - return parse_expression(s, n, vs, c, dt, 0); - } + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override { + return parse_expression(s, n, vs, c, dt, 0); + } - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::shared_ptr atom_; - std::shared_ptr binop_; - BinOpeInfo info_; - const Definition &rule_; + std::shared_ptr atom_; + std::shared_ptr binop_; + BinOpeInfo info_; + const Definition &rule_; private: - size_t parse_expression(const char *s, size_t n, SemanticValues &vs, - Context &c, std::any &dt, size_t min_prec) const; + size_t parse_expression(const char *s, size_t n, SemanticValues &vs, + Context &c, std::any &dt, size_t min_prec) const; - Definition &get_reference_for_binop(Context &c) const; + Definition &get_reference_for_binop(Context &c) const; }; class Recovery : public Ope { public: - Recovery(const std::shared_ptr &ope) : ope_(ope) {} + Recovery(const std::shared_ptr &ope) : ope_(ope) {} - size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, - std::any &dt) const override; + size_t parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, + std::any &dt) const override; - void accept(Visitor &v) override; + void accept(Visitor &v) override; - std::shared_ptr ope_; + std::shared_ptr ope_; }; class Cut : public Ope, public std::enable_shared_from_this { public: - size_t parse_core(const char * /*s*/, size_t /*n*/, SemanticValues & /*vs*/, - Context &c, std::any & /*dt*/) const override { - if (!c.cut_stack.empty()) { c.cut_stack.back() = true; } - return 0; - } + size_t parse_core(const char * /*s*/, size_t /*n*/, SemanticValues & /*vs*/, + Context &c, std::any & /*dt*/) const override { + if (!c.cut_stack.empty()) { c.cut_stack.back() = true; } + return 0; + } - void accept(Visitor &v) override; + void accept(Visitor &v) override; }; /* * Factories */ template std::shared_ptr seq(Args &&...args) { - return std::make_shared(static_cast>(args)...); + return std::make_shared(static_cast>(args)...); } template std::shared_ptr cho(Args &&...args) { - return std::make_shared( - false, static_cast>(args)...); + return std::make_shared( + false, static_cast>(args)...); } template std::shared_ptr cho4label_(Args &&...args) { - return std::make_shared( - true, static_cast>(args)...); + return std::make_shared( + true, static_cast>(args)...); } inline std::shared_ptr zom(const std::shared_ptr &ope) { - return Repetition::zom(ope); + return Repetition::zom(ope); } inline std::shared_ptr oom(const std::shared_ptr &ope) { - return Repetition::oom(ope); + return Repetition::oom(ope); } inline std::shared_ptr opt(const std::shared_ptr &ope) { - return Repetition::opt(ope); + return Repetition::opt(ope); } inline std::shared_ptr rep(const std::shared_ptr &ope, size_t min, size_t max) { - return std::make_shared(ope, min, max); + return std::make_shared(ope, min, max); } inline std::shared_ptr apd(const std::shared_ptr &ope) { - return std::make_shared(ope); + return std::make_shared(ope); } inline std::shared_ptr npd(const std::shared_ptr &ope) { - return std::make_shared(ope); + return std::make_shared(ope); } inline std::shared_ptr dic(const std::vector &v, bool ignore_case) { - return std::make_shared(v, ignore_case); + return std::make_shared(v, ignore_case); } inline std::shared_ptr lit(std::string &&s) { - return std::make_shared(s, false); + return std::make_shared(s, false); } inline std::shared_ptr liti(std::string &&s) { - return std::make_shared(s, true); + return std::make_shared(s, true); } inline std::shared_ptr cls(const std::string &s) { - return std::make_shared(s, false, false); + return std::make_shared(s, false, false); } inline std::shared_ptr cls(const std::vector> &ranges, bool ignore_case = false) { - return std::make_shared(ranges, false, ignore_case); + return std::make_shared(ranges, false, ignore_case); } inline std::shared_ptr ncls(const std::string &s) { - return std::make_shared(s, true, false); + return std::make_shared(s, true, false); } inline std::shared_ptr ncls(const std::vector> &ranges, bool ignore_case = false) { - return std::make_shared(ranges, true, ignore_case); + return std::make_shared(ranges, true, ignore_case); } -inline std::shared_ptr chr(char dt) { - return std::make_shared(dt); +inline std::shared_ptr chr(char32_t dt) { + return std::make_shared(dt); } inline std::shared_ptr dot() { return std::make_shared(); } inline std::shared_ptr csc(const std::shared_ptr &ope) { - return std::make_shared(ope); + return std::make_shared(ope); } inline std::shared_ptr cap(const std::shared_ptr &ope, Capture::MatchAction ma) { - return std::make_shared(ope, ma); + return std::make_shared(ope, ma); } inline std::shared_ptr tok(const std::shared_ptr &ope) { - return std::make_shared(ope); + return std::make_shared(ope); } inline std::shared_ptr ign(const std::shared_ptr &ope) { - return std::make_shared(ope); + return std::make_shared(ope); } inline std::shared_ptr usr(std::function fn) { - return std::make_shared(fn); + return std::make_shared(fn); } inline std::shared_ptr ref(const Grammar &grammar, const std::string &name, const char *s, bool is_macro, const std::vector> &args) { - return std::make_shared(grammar, name, s, is_macro, args); + return std::make_shared(grammar, name, s, is_macro, args); } inline std::shared_ptr wsp(const std::shared_ptr &ope) { - return std::make_shared(std::make_shared(ope)); + return std::make_shared(std::make_shared(ope)); } inline std::shared_ptr bkr(std::string &&name) { - return std::make_shared(name); + return std::make_shared(name); } inline std::shared_ptr pre(const std::shared_ptr &atom, const std::shared_ptr &binop, const PrecedenceClimbing::BinOpeInfo &info, const Definition &rule) { - return std::make_shared(atom, binop, info, rule); + return std::make_shared(atom, binop, info, rule); } inline std::shared_ptr rec(const std::shared_ptr &ope) { - return std::make_shared(ope); + return std::make_shared(ope); } inline std::shared_ptr cut() { return std::make_shared(); } @@ -1686,511 +1966,600 @@ inline std::shared_ptr cut() { return std::make_shared(); } * Visitor */ struct Ope::Visitor { - virtual ~Visitor() {} - virtual void visit(Sequence &) {} - virtual void visit(PrioritizedChoice &) {} - virtual void visit(Repetition &) {} - virtual void visit(AndPredicate &) {} - virtual void visit(NotPredicate &) {} - virtual void visit(Dictionary &) {} - virtual void visit(LiteralString &) {} - virtual void visit(CharacterClass &) {} - virtual void visit(Character &) {} - virtual void visit(AnyCharacter &) {} - virtual void visit(CaptureScope &) {} - virtual void visit(Capture &) {} - virtual void visit(TokenBoundary &) {} - virtual void visit(Ignore &) {} - virtual void visit(User &) {} - virtual void visit(WeakHolder &) {} - virtual void visit(Holder &) {} - virtual void visit(Reference &) {} - virtual void visit(Whitespace &) {} - virtual void visit(BackReference &) {} - virtual void visit(PrecedenceClimbing &) {} - virtual void visit(Recovery &) {} - virtual void visit(Cut &) {} + virtual ~Visitor() {} + virtual void visit(Sequence &) {} + virtual void visit(PrioritizedChoice &) {} + virtual void visit(Repetition &) {} + virtual void visit(AndPredicate &) {} + virtual void visit(NotPredicate &) {} + virtual void visit(Dictionary &) {} + virtual void visit(LiteralString &) {} + virtual void visit(CharacterClass &) {} + virtual void visit(Character &) {} + virtual void visit(AnyCharacter &) {} + virtual void visit(CaptureScope &) {} + virtual void visit(Capture &) {} + virtual void visit(TokenBoundary &) {} + virtual void visit(Ignore &) {} + virtual void visit(User &) {} + virtual void visit(WeakHolder &) {} + virtual void visit(Holder &) {} + virtual void visit(Reference &) {} + virtual void visit(Whitespace &) {} + virtual void visit(BackReference &) {} + virtual void visit(PrecedenceClimbing &) {} + virtual void visit(Recovery &) {} + virtual void visit(Cut &) {} +}; + +struct TraversalVisitor : public Ope::Visitor { + using Ope::Visitor::visit; + void visit(Sequence &ope) override { + for (auto &op : ope.opes_) { + op->accept(*this); + } + } + void visit(PrioritizedChoice &ope) override { + for (auto &op : ope.opes_) { + op->accept(*this); + } + } + void visit(Repetition &ope) override { ope.ope_->accept(*this); } + void visit(AndPredicate &ope) override { ope.ope_->accept(*this); } + void visit(NotPredicate &ope) override { ope.ope_->accept(*this); } + void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } + void visit(Capture &ope) override { ope.ope_->accept(*this); } + void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } + void visit(Ignore &ope) override { ope.ope_->accept(*this); } + void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } + void visit(Holder &ope) override { ope.ope_->accept(*this); } + void visit(Whitespace &ope) override { ope.ope_->accept(*this); } + void visit(Recovery &ope) override { ope.ope_->accept(*this); } + void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } }; struct TraceOpeName : public Ope::Visitor { - using Ope::Visitor::visit; + using Ope::Visitor::visit; - void visit(Sequence &) override { name_ = "Sequence"; } - void visit(PrioritizedChoice &) override { name_ = "PrioritizedChoice"; } - void visit(Repetition &) override { name_ = "Repetition"; } - void visit(AndPredicate &) override { name_ = "AndPredicate"; } - void visit(NotPredicate &) override { name_ = "NotPredicate"; } - void visit(Dictionary &) override { name_ = "Dictionary"; } - void visit(LiteralString &) override { name_ = "LiteralString"; } - void visit(CharacterClass &) override { name_ = "CharacterClass"; } - void visit(Character &) override { name_ = "Character"; } - void visit(AnyCharacter &) override { name_ = "AnyCharacter"; } - void visit(CaptureScope &) override { name_ = "CaptureScope"; } - void visit(Capture &) override { name_ = "Capture"; } - void visit(TokenBoundary &) override { name_ = "TokenBoundary"; } - void visit(Ignore &) override { name_ = "Ignore"; } - void visit(User &) override { name_ = "User"; } - void visit(WeakHolder &) override { name_ = "WeakHolder"; } - void visit(Holder &ope) override { name_ = ope.trace_name().data(); } - void visit(Reference &) override { name_ = "Reference"; } - void visit(Whitespace &) override { name_ = "Whitespace"; } - void visit(BackReference &) override { name_ = "BackReference"; } - void visit(PrecedenceClimbing &) override { name_ = "PrecedenceClimbing"; } - void visit(Recovery &) override { name_ = "Recovery"; } - void visit(Cut &) override { name_ = "Cut"; } + void visit(Sequence &) override { name_ = "Sequence"; } + void visit(PrioritizedChoice &) override { name_ = "PrioritizedChoice"; } + void visit(Repetition &) override { name_ = "Repetition"; } + void visit(AndPredicate &) override { name_ = "AndPredicate"; } + void visit(NotPredicate &) override { name_ = "NotPredicate"; } + void visit(Dictionary &) override { name_ = "Dictionary"; } + void visit(LiteralString &) override { name_ = "LiteralString"; } + void visit(CharacterClass &) override { name_ = "CharacterClass"; } + void visit(Character &) override { name_ = "Character"; } + void visit(AnyCharacter &) override { name_ = "AnyCharacter"; } + void visit(CaptureScope &) override { name_ = "CaptureScope"; } + void visit(Capture &) override { name_ = "Capture"; } + void visit(TokenBoundary &) override { name_ = "TokenBoundary"; } + void visit(Ignore &) override { name_ = "Ignore"; } + void visit(User &) override { name_ = "User"; } + void visit(WeakHolder &) override { name_ = "WeakHolder"; } + void visit(Holder &ope) override { name_ = ope.trace_name().data(); } + void visit(Reference &) override { name_ = "Reference"; } + void visit(Whitespace &) override { name_ = "Whitespace"; } + void visit(BackReference &) override { name_ = "BackReference"; } + void visit(PrecedenceClimbing &) override { name_ = "PrecedenceClimbing"; } + void visit(Recovery &) override { name_ = "Recovery"; } + void visit(Cut &) override { name_ = "Cut"; } - static std::string get(Ope &ope) { - TraceOpeName vis; - ope.accept(vis); - return vis.name_; - } + static std::string get(Ope &ope) { + TraceOpeName vis; + ope.accept(vis); + return vis.name_; + } private: - const char *name_ = nullptr; + const char *name_ = nullptr; }; -struct AssignIDToDefinition : public Ope::Visitor { - using Ope::Visitor::visit; +struct AssignIDToDefinition : public TraversalVisitor { + using TraversalVisitor::visit; - void visit(Sequence &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - } - } - void visit(PrioritizedChoice &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - } - } - void visit(Repetition &ope) override { ope.ope_->accept(*this); } - void visit(AndPredicate &ope) override { ope.ope_->accept(*this); } - void visit(NotPredicate &ope) override { ope.ope_->accept(*this); } - void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } - void visit(Capture &ope) override { ope.ope_->accept(*this); } - void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } - void visit(Ignore &ope) override { ope.ope_->accept(*this); } - void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } - void visit(Holder &ope) override; - void visit(Reference &ope) override; - void visit(Whitespace &ope) override { ope.ope_->accept(*this); } - void visit(PrecedenceClimbing &ope) override; - void visit(Recovery &ope) override { ope.ope_->accept(*this); } + void visit(Holder &ope) override; + void visit(Reference &ope) override; + void visit(PrecedenceClimbing &ope) override; - std::unordered_map ids; + std::unordered_map ids; }; struct IsLiteralToken : public Ope::Visitor { - using Ope::Visitor::visit; + using Ope::Visitor::visit; - void visit(PrioritizedChoice &ope) override { - for (auto op : ope.opes_) { - if (!IsLiteralToken::check(*op)) { return; } - } - result_ = true; + void visit(PrioritizedChoice &ope) override { + for (const auto &op : ope.opes_) { + if (!IsLiteralToken::check(*op)) { return; } } + result_ = true; + } - void visit(Dictionary &) override { result_ = true; } - void visit(LiteralString &) override { result_ = true; } + void visit(Dictionary &) override { result_ = true; } + void visit(LiteralString &) override { result_ = true; } - static bool check(Ope &ope) { - IsLiteralToken vis; - ope.accept(vis); - return vis.result_; - } + static bool check(Ope &ope) { + IsLiteralToken vis; + ope.accept(vis); + return vis.result_; + } private: - bool result_ = false; + bool result_ = false; }; -struct TokenChecker : public Ope::Visitor { - using Ope::Visitor::visit; +struct TokenChecker : public TraversalVisitor { + using TraversalVisitor::visit; - void visit(Sequence &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - } - } - void visit(PrioritizedChoice &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - } - } - void visit(Repetition &ope) override { ope.ope_->accept(*this); } - void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } - void visit(Capture &ope) override { ope.ope_->accept(*this); } - void visit(TokenBoundary &) override { has_token_boundary_ = true; } - void visit(Ignore &ope) override { ope.ope_->accept(*this); } - void visit(WeakHolder &) override { has_rule_ = true; } - void visit(Holder &ope) override { ope.ope_->accept(*this); } - void visit(Reference &ope) override; - void visit(Whitespace &ope) override { ope.ope_->accept(*this); } - void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } - void visit(Recovery &ope) override { ope.ope_->accept(*this); } + void visit(TokenBoundary &) override { has_token_boundary_ = true; } + void visit(AndPredicate &) override {} + void visit(NotPredicate &) override {} + void visit(WeakHolder &) override { has_rule_ = true; } + void visit(Reference &ope) override; - static bool is_token(Ope &ope) { - if (IsLiteralToken::check(ope)) { return true; } + static bool is_token(Ope &ope) { + if (IsLiteralToken::check(ope)) { return true; } - TokenChecker vis; - ope.accept(vis); - return vis.has_token_boundary_ || !vis.has_rule_; - } + TokenChecker vis; + ope.accept(vis); + return vis.has_token_boundary_ || !vis.has_rule_; + } private: - bool has_token_boundary_ = false; - bool has_rule_ = false; + bool has_token_boundary_ = false; + bool has_rule_ = false; }; struct FindLiteralToken : public Ope::Visitor { - using Ope::Visitor::visit; + using Ope::Visitor::visit; - void visit(LiteralString &ope) override { token_ = ope.lit_.data(); } - void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } - void visit(Ignore &ope) override { ope.ope_->accept(*this); } - void visit(Reference &ope) override; - void visit(Recovery &ope) override { ope.ope_->accept(*this); } + void visit(LiteralString &ope) override { token_ = ope.lit_.data(); } + void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } + void visit(Ignore &ope) override { ope.ope_->accept(*this); } + void visit(Reference &ope) override; + void visit(Recovery &ope) override { ope.ope_->accept(*this); } - static const char *token(Ope &ope) { - FindLiteralToken vis; - ope.accept(vis); - return vis.token_; - } + static const char *token(Ope &ope) { + FindLiteralToken vis; + ope.accept(vis); + return vis.token_; + } private: - const char *token_ = nullptr; + const char *token_ = nullptr; }; -struct DetectLeftRecursion : public Ope::Visitor { - using Ope::Visitor::visit; +struct DetectLeftRecursion : public TraversalVisitor { + using TraversalVisitor::visit; - DetectLeftRecursion(const std::string &name) : name_(name) {} + DetectLeftRecursion(const std::string &name) : name_(name) {} - void visit(Sequence &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - if (done_) { - break; - } else if (error_s) { - done_ = true; - break; - } - } + void visit(Sequence &ope) override { + for (const auto &op : ope.opes_) { + op->accept(*this); + if (done_) { + break; + } else if (error_s) { + done_ = true; + break; + } } - void visit(PrioritizedChoice &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - if (error_s) { - done_ = true; - break; - } - } + } + void visit(PrioritizedChoice &ope) override { + for (const auto &op : ope.opes_) { + op->accept(*this); + if (error_s) { + done_ = true; + break; + } } - void visit(Repetition &ope) override { - ope.ope_->accept(*this); - done_ = ope.min_ > 0; - } - void visit(AndPredicate &ope) override { - ope.ope_->accept(*this); - done_ = false; - } - void visit(NotPredicate &ope) override { - ope.ope_->accept(*this); - done_ = false; - } - void visit(Dictionary &) override { done_ = true; } - void visit(LiteralString &ope) override { done_ = !ope.lit_.empty(); } - void visit(CharacterClass &) override { done_ = true; } - void visit(Character &) override { done_ = true; } - void visit(AnyCharacter &) override { done_ = true; } - void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } - void visit(Capture &ope) override { ope.ope_->accept(*this); } - void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } - void visit(Ignore &ope) override { ope.ope_->accept(*this); } - void visit(User &) override { done_ = true; } - void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } - void visit(Holder &ope) override { ope.ope_->accept(*this); } - void visit(Reference &ope) override; - void visit(Whitespace &ope) override { ope.ope_->accept(*this); } - void visit(BackReference &) override { done_ = true; } - void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } - void visit(Recovery &ope) override { ope.ope_->accept(*this); } - void visit(Cut &) override { done_ = true; } + } + void visit(Repetition &ope) override { + ope.ope_->accept(*this); + done_ = ope.min_ > 0; + } + void visit(AndPredicate &ope) override { + ope.ope_->accept(*this); + done_ = false; + } + void visit(NotPredicate &ope) override { + ope.ope_->accept(*this); + done_ = false; + } + void visit(Dictionary &) override { done_ = true; } + void visit(LiteralString &ope) override { done_ = !ope.lit_.empty(); } + void visit(CharacterClass &) override { done_ = true; } + void visit(Character &) override { done_ = true; } + void visit(AnyCharacter &) override { done_ = true; } + void visit(User &) override { done_ = true; } + void visit(Reference &ope) override; + void visit(BackReference &) override { done_ = true; } + void visit(Cut &) override { done_ = true; } - const char *error_s = nullptr; + const char *error_s = nullptr; + + std::shared_ptr resolve_macro_arg(size_t iarg) const; private: - std::string name_; - std::unordered_set refs_; - bool done_ = false; + std::string name_; + std::unordered_set refs_; + bool done_ = false; + std::vector> *> macro_args_stack_; }; -struct HasEmptyElement : public Ope::Visitor { - using Ope::Visitor::visit; +struct ComputeCanBeEmpty : public TraversalVisitor { + using TraversalVisitor::visit; - HasEmptyElement(std::vector> &refs, - std::unordered_map &has_error_cache) - : refs_(refs), has_error_cache_(has_error_cache) {} + bool result = false; - void visit(Sequence &ope) override; - void visit(PrioritizedChoice &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - if (is_empty) { return; } - } - } - void visit(Repetition &ope) override { - if (ope.min_ == 0) { - set_error(); - } else { - ope.ope_->accept(*this); - } - } - void visit(AndPredicate &) override { set_error(); } - void visit(NotPredicate &) override { set_error(); } - void visit(LiteralString &ope) override { - if (ope.lit_.empty()) { set_error(); } - } - void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } - void visit(Capture &ope) override { ope.ope_->accept(*this); } - void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } - void visit(Ignore &ope) override { ope.ope_->accept(*this); } - void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } - void visit(Holder &ope) override { ope.ope_->accept(*this); } - void visit(Reference &ope) override; - void visit(Whitespace &ope) override { ope.ope_->accept(*this); } - void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } - void visit(Recovery &ope) override { ope.ope_->accept(*this); } - - bool is_empty = false; - const char *error_s = nullptr; - std::string error_name; - -private: - void set_error() { - is_empty = true; - tie(error_s, error_name) = refs_.back(); - } - std::vector> &refs_; - std::unordered_map &has_error_cache_; + void visit(Sequence &ope) override { + result = std::all_of(ope.opes_.begin(), ope.opes_.end(), [](auto &op) { + ComputeCanBeEmpty vis; + op->accept(vis); + return vis.result; + }); + } + void visit(PrioritizedChoice &ope) override { + result = std::any_of(ope.opes_.begin(), ope.opes_.end(), [](auto &op) { + ComputeCanBeEmpty vis; + op->accept(vis); + return vis.result; + }); + } + void visit(Repetition &ope) override { result = ope.min_ == 0; } + void visit(AndPredicate &) override { result = true; } + void visit(NotPredicate &) override { result = true; } + void visit(Dictionary &) override { result = false; } + void visit(LiteralString &ope) override { result = ope.lit_.empty(); } + void visit(CharacterClass &) override { result = false; } + void visit(Character &) override { result = false; } + void visit(AnyCharacter &) override { result = false; } + void visit(User &) override { result = false; } + void visit(Reference &ope) override; + void visit(BackReference &) override { result = false; } + void visit(Cut &) override { result = false; } }; -struct DetectInfiniteLoop : public Ope::Visitor { - using Ope::Visitor::visit; +struct HasEmptyElement : public TraversalVisitor { + using TraversalVisitor::visit; - DetectInfiniteLoop(const char *s, const std::string &name, - std::vector> &refs, - std::unordered_map &has_error_cache) - : refs_(refs), has_error_cache_(has_error_cache) { - refs_.emplace_back(s, name); - } + HasEmptyElement(std::vector> &refs, + std::unordered_map &has_error_cache) + : refs_(refs), has_error_cache_(has_error_cache) {} - DetectInfiniteLoop(std::vector> &refs, - std::unordered_map &has_error_cache) - : refs_(refs), has_error_cache_(has_error_cache) {} + void visit(Sequence &ope) override; + void visit(PrioritizedChoice &ope) override { + for (const auto &op : ope.opes_) { + op->accept(*this); + if (is_empty) { return; } + } + } + void visit(Repetition &ope) override { + if (ope.min_ == 0) { + set_error(); + } else { + ope.ope_->accept(*this); + } + } + void visit(AndPredicate &) override { set_error(); } + void visit(NotPredicate &) override { set_error(); } + void visit(LiteralString &ope) override { + if (ope.lit_.empty()) { set_error(); } + } + void visit(Reference &ope) override; - void visit(Sequence &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - if (has_error) { return; } - } - } - void visit(PrioritizedChoice &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - if (has_error) { return; } - } - } - void visit(Repetition &ope) override { - if (ope.max_ == std::numeric_limits::max()) { - HasEmptyElement vis(refs_, has_error_cache_); - ope.ope_->accept(vis); - if (vis.is_empty) { - has_error = true; - error_s = vis.error_s; - error_name = vis.error_name; - } - } else { - ope.ope_->accept(*this); - } - } - void visit(AndPredicate &ope) override { ope.ope_->accept(*this); } - void visit(NotPredicate &ope) override { ope.ope_->accept(*this); } - void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } - void visit(Capture &ope) override { ope.ope_->accept(*this); } - void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } - void visit(Ignore &ope) override { ope.ope_->accept(*this); } - void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } - void visit(Holder &ope) override { ope.ope_->accept(*this); } - void visit(Reference &ope) override; - void visit(Whitespace &ope) override { ope.ope_->accept(*this); } - void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } - void visit(Recovery &ope) override { ope.ope_->accept(*this); } - - bool has_error = false; - const char *error_s = nullptr; - std::string error_name; + bool is_empty = false; + const char *error_s = nullptr; + std::string error_name; private: - std::vector> &refs_; - std::unordered_map &has_error_cache_; + void set_error() { + is_empty = true; + tie(error_s, error_name) = refs_.back(); + } + std::vector> &refs_; + std::unordered_map &has_error_cache_; }; -struct ReferenceChecker : public Ope::Visitor { - using Ope::Visitor::visit; +struct DetectInfiniteLoop : public TraversalVisitor { + using TraversalVisitor::visit; - ReferenceChecker(const Grammar &grammar, - const std::vector ¶ms) - : grammar_(grammar), params_(params) {} + DetectInfiniteLoop(const char *s, const std::string &name, + std::vector> &refs, + std::unordered_map &has_error_cache) + : refs_(refs), has_error_cache_(has_error_cache) { + refs_.emplace_back(s, name); + } - void visit(Sequence &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - } + DetectInfiniteLoop(std::vector> &refs, + std::unordered_map &has_error_cache) + : refs_(refs), has_error_cache_(has_error_cache) {} + + void visit(Sequence &ope) override { + for (const auto &op : ope.opes_) { + op->accept(*this); + if (has_error) { return; } } - void visit(PrioritizedChoice &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - } + } + void visit(PrioritizedChoice &ope) override { + for (const auto &op : ope.opes_) { + op->accept(*this); + if (has_error) { return; } } - void visit(Repetition &ope) override { ope.ope_->accept(*this); } - void visit(AndPredicate &ope) override { ope.ope_->accept(*this); } - void visit(NotPredicate &ope) override { ope.ope_->accept(*this); } - void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } - void visit(Capture &ope) override { ope.ope_->accept(*this); } - void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } - void visit(Ignore &ope) override { ope.ope_->accept(*this); } - void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } - void visit(Holder &ope) override { ope.ope_->accept(*this); } - void visit(Reference &ope) override; - void visit(Whitespace &ope) override { ope.ope_->accept(*this); } - void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } - void visit(Recovery &ope) override { ope.ope_->accept(*this); } + } + void visit(Repetition &ope) override { + if (ope.max_ == std::numeric_limits::max()) { + HasEmptyElement vis(refs_, has_error_cache_); + ope.ope_->accept(vis); + if (vis.is_empty) { + has_error = true; + error_s = vis.error_s; + error_name = vis.error_name; + } + } else { + ope.ope_->accept(*this); + } + } + void visit(Reference &ope) override; - std::unordered_map error_s; - std::unordered_map error_message; - std::unordered_set referenced; + bool has_error = false; + const char *error_s = nullptr; + std::string error_name; private: - const Grammar &grammar_; - const std::vector ¶ms_; + std::vector> &refs_; + std::unordered_map &has_error_cache_; }; -struct LinkReferences : public Ope::Visitor { - using Ope::Visitor::visit; +struct ReferenceChecker : public TraversalVisitor { + using TraversalVisitor::visit; - LinkReferences(Grammar &grammar, const std::vector ¶ms) - : grammar_(grammar), params_(params) {} + ReferenceChecker(const Grammar &grammar, + const std::vector ¶ms) + : grammar_(grammar), params_(params) {} - void visit(Sequence &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - } - } - void visit(PrioritizedChoice &ope) override { - for (auto op : ope.opes_) { - op->accept(*this); - } - } - void visit(Repetition &ope) override { ope.ope_->accept(*this); } - void visit(AndPredicate &ope) override { ope.ope_->accept(*this); } - void visit(NotPredicate &ope) override { ope.ope_->accept(*this); } - void visit(CaptureScope &ope) override { ope.ope_->accept(*this); } - void visit(Capture &ope) override { ope.ope_->accept(*this); } - void visit(TokenBoundary &ope) override { ope.ope_->accept(*this); } - void visit(Ignore &ope) override { ope.ope_->accept(*this); } - void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } - void visit(Holder &ope) override { ope.ope_->accept(*this); } - void visit(Reference &ope) override; - void visit(Whitespace &ope) override { ope.ope_->accept(*this); } - void visit(PrecedenceClimbing &ope) override { ope.atom_->accept(*this); } - void visit(Recovery &ope) override { ope.ope_->accept(*this); } + void visit(Reference &ope) override; + + std::unordered_map error_s; + std::unordered_map error_message; + std::unordered_set referenced; private: - Grammar &grammar_; - const std::vector ¶ms_; + const Grammar &grammar_; + const std::vector ¶ms_; +}; + +struct LinkReferences : public TraversalVisitor { + using TraversalVisitor::visit; + + LinkReferences(Grammar &grammar, const std::vector ¶ms) + : grammar_(grammar), params_(params) {} + + void visit(Reference &ope) override; + +private: + Grammar &grammar_; + const std::vector ¶ms_; }; struct FindReference : public Ope::Visitor { - using Ope::Visitor::visit; + using Ope::Visitor::visit; - FindReference(const std::vector> &args, - const std::vector ¶ms) - : args_(args), params_(params) {} + FindReference(const std::vector> &args, + const std::vector ¶ms) + : args_(args), params_(params) {} - void visit(Sequence &ope) override { - std::vector> opes; - for (auto o : ope.opes_) { - o->accept(*this); - opes.push_back(found_ope); - } - found_ope = std::make_shared(opes); + void visit(Sequence &ope) override { + std::vector> opes; + for (const auto &o : ope.opes_) { + o->accept(*this); + opes.emplace_back(std::move(found_ope)); } - void visit(PrioritizedChoice &ope) override { - std::vector> opes; - for (auto o : ope.opes_) { - o->accept(*this); - opes.push_back(found_ope); - } - found_ope = std::make_shared(opes); + found_ope = std::make_shared(opes); + } + void visit(PrioritizedChoice &ope) override { + std::vector> opes; + for (const auto &o : ope.opes_) { + o->accept(*this); + opes.emplace_back(std::move(found_ope)); } - void visit(Repetition &ope) override { - ope.ope_->accept(*this); - found_ope = rep(found_ope, ope.min_, ope.max_); - } - void visit(AndPredicate &ope) override { - ope.ope_->accept(*this); - found_ope = apd(found_ope); - } - void visit(NotPredicate &ope) override { - ope.ope_->accept(*this); - found_ope = npd(found_ope); - } - void visit(Dictionary &ope) override { found_ope = ope.shared_from_this(); } - void visit(LiteralString &ope) override { - found_ope = ope.shared_from_this(); - } - void visit(CharacterClass &ope) override { - found_ope = ope.shared_from_this(); - } - void visit(Character &ope) override { found_ope = ope.shared_from_this(); } - void visit(AnyCharacter &ope) override { found_ope = ope.shared_from_this(); } - void visit(CaptureScope &ope) override { - ope.ope_->accept(*this); - found_ope = csc(found_ope); - } - void visit(Capture &ope) override { - ope.ope_->accept(*this); - found_ope = cap(found_ope, ope.match_action_); - } - void visit(TokenBoundary &ope) override { - ope.ope_->accept(*this); - found_ope = tok(found_ope); - } - void visit(Ignore &ope) override { - ope.ope_->accept(*this); - found_ope = ign(found_ope); - } - void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } - void visit(Holder &ope) override { ope.ope_->accept(*this); } - void visit(Reference &ope) override; - void visit(Whitespace &ope) override { - ope.ope_->accept(*this); - found_ope = wsp(found_ope); - } - void visit(PrecedenceClimbing &ope) override { - ope.atom_->accept(*this); - found_ope = csc(found_ope); - } - void visit(Recovery &ope) override { - ope.ope_->accept(*this); - found_ope = rec(found_ope); - } - void visit(Cut &ope) override { found_ope = ope.shared_from_this(); } + found_ope = std::make_shared(opes); + } + void visit(Repetition &ope) override { + ope.ope_->accept(*this); + found_ope = rep(found_ope, ope.min_, ope.max_); + } + void visit(AndPredicate &ope) override { + ope.ope_->accept(*this); + found_ope = apd(found_ope); + } + void visit(NotPredicate &ope) override { + ope.ope_->accept(*this); + found_ope = npd(found_ope); + } + void visit(Dictionary &ope) override { found_ope = ope.shared_from_this(); } + void visit(LiteralString &ope) override { + found_ope = ope.shared_from_this(); + } + void visit(CharacterClass &ope) override { + found_ope = ope.shared_from_this(); + } + void visit(Character &ope) override { found_ope = ope.shared_from_this(); } + void visit(AnyCharacter &ope) override { found_ope = ope.shared_from_this(); } + void visit(CaptureScope &ope) override { + ope.ope_->accept(*this); + found_ope = csc(found_ope); + } + void visit(Capture &ope) override { + ope.ope_->accept(*this); + found_ope = cap(found_ope, ope.match_action_); + } + void visit(TokenBoundary &ope) override { + ope.ope_->accept(*this); + found_ope = tok(found_ope); + } + void visit(Ignore &ope) override { + ope.ope_->accept(*this); + found_ope = ign(found_ope); + } + void visit(WeakHolder &ope) override { ope.weak_.lock()->accept(*this); } + void visit(Holder &ope) override { ope.ope_->accept(*this); } + void visit(Reference &ope) override; + void visit(Whitespace &ope) override { + ope.ope_->accept(*this); + found_ope = wsp(found_ope); + } + void visit(PrecedenceClimbing &ope) override { + ope.atom_->accept(*this); + found_ope = csc(found_ope); + } + void visit(Recovery &ope) override { + ope.ope_->accept(*this); + found_ope = rec(found_ope); + } + void visit(Cut &ope) override { found_ope = ope.shared_from_this(); } - std::shared_ptr found_ope; + std::shared_ptr found_ope; private: - const std::vector> &args_; - const std::vector ¶ms_; + const std::vector> &args_; + const std::vector ¶ms_; +}; + +/* + * First-Set computation + */ +struct ComputeFirstSet : public TraversalVisitor { + using TraversalVisitor::visit; + + void visit(Sequence &ope) override { + for (const auto &op : ope.opes_) { + auto save = result_; + result_ = FirstSet{}; + op->accept(*this); + auto element_fs = result_; + result_ = save; + result_.chars |= element_fs.chars; + if (element_fs.any_char) { result_.any_char = true; } + if (!result_.first_literal) { + result_.first_literal = element_fs.first_literal; + } + if (!result_.first_rule) { result_.first_rule = element_fs.first_rule; } + if (!element_fs.can_be_empty) { return; } + // This element can be empty, continue to next + } + result_.can_be_empty = true; + } + void visit(PrioritizedChoice &ope) override { + auto save = result_; + for (const auto &op : ope.opes_) { + result_ = FirstSet{}; + op->accept(*this); + save.merge(result_); + } + result_ = save; + } + void visit(Repetition &ope) override { + ope.ope_->accept(*this); + if (ope.min_ == 0) { result_.can_be_empty = true; } + } + void visit(AndPredicate &) override { result_.can_be_empty = true; } + void visit(NotPredicate &) override { result_.can_be_empty = true; } + void visit(Dictionary &ope) override { + for (const auto &[key, info] : ope.trie_.dic_) { + if (!key.empty()) { + auto ch = static_cast(key[0]); + result_.chars.set(ch); + if (ope.trie_.ignore_case_) { + result_.chars.set(static_cast(std::toupper(ch))); + result_.chars.set(static_cast(std::tolower(ch))); + } + } + } + } + void visit(LiteralString &ope) override { + if (ope.lit_.empty()) { + result_.can_be_empty = true; + } else { + auto ch = static_cast(ope.lit_[0]); + result_.chars.set(ch); + if (ope.ignore_case_) { + result_.chars.set(static_cast(std::toupper(ch))); + result_.chars.set(static_cast(std::tolower(ch))); + } + if (!result_.first_literal) { result_.first_literal = ope.lit_.c_str(); } + } + } + void visit(CharacterClass &ope) override { + for (const auto &range : ope.ranges_) { + auto cp1 = range.first; + auto cp2 = range.second; + if (cp1 > 0x7F || cp2 > 0x7F) { + // Non-ASCII range: conservative fallback + result_.any_char = true; + return; + } + for (auto cp = cp1; cp <= cp2; cp++) { + auto ch = static_cast(cp); + result_.chars.set(ch); + if (ope.ignore_case_) { + result_.chars.set(static_cast(std::toupper(ch))); + result_.chars.set(static_cast(std::tolower(ch))); + } + } + } + if (ope.negated_) { + result_.chars.flip(); + result_.any_char = true; // negated class can match non-ASCII + } + } + void visit(Character &ope) override { + if (ope.ch_ > 0x7F) { + result_.any_char = true; + } else { + result_.chars.set(static_cast(ope.ch_)); + } + } + void visit(AnyCharacter &) override { result_.any_char = true; } + void visit(User &) override { result_.any_char = true; } + void visit(Reference &ope) override; + void visit(BackReference &) override { result_.any_char = true; } + void visit(Cut &) override { result_.can_be_empty = true; } + + FirstSet result_; + +private: + std::unordered_set refs_; +}; + +struct SetupFirstSets : public TraversalVisitor { + using TraversalVisitor::visit; + + void visit(Sequence &ope) override; + void setup_keyword_guarded_identifier(Sequence &ope); + + void visit(PrioritizedChoice &ope) override { + ope.first_sets_.clear(); + ope.first_sets_.reserve(ope.opes_.size()); + for (const auto &op : ope.opes_) { + ComputeFirstSet cfs; + op->accept(cfs); + ope.first_sets_.push_back(cfs.result_); + } + for (const auto &op : ope.opes_) { + op->accept(*this); + } + } + void visit(Repetition &ope) override { + ope.ope_->accept(*this); + // ISpan optimization: detect Repetition + ASCII CharacterClass + auto cc = dynamic_cast(ope.ope_.get()); + if (cc && cc->is_ascii_only()) { ope.span_bitset_ = &cc->ascii_bitset(); } + } + void visit(Reference &ope) override; + +private: + std::unordered_set refs_; }; /* @@ -2205,279 +2574,299 @@ static const char *RECOVER_DEFINITION_NAME = "%recover"; */ class Definition { public: - struct Result { - bool ret; - bool recovered; - size_t len; - ErrorInfo error_info; - }; + struct Result { + bool ret; + bool recovered; + size_t len; + ErrorInfo error_info; + }; - Definition() : holder_(std::make_shared(this)) {} + Definition() : holder_(std::make_shared(this)) {} - Definition(const Definition &rhs) : name(rhs.name), holder_(rhs.holder_) { - holder_->outer_ = this; + Definition(const Definition &rhs) : name(rhs.name), holder_(rhs.holder_) { + holder_->outer_ = this; + } + + Definition(const std::shared_ptr &ope) + : holder_(std::make_shared(this)) { + *this <= ope; + } + + operator std::shared_ptr() { + return std::make_shared(holder_); + } + + Definition &operator<=(const std::shared_ptr &ope) { + holder_->ope_ = ope; + return *this; + } + + Result parse(const char *s, size_t n, const char *path = nullptr, + Log log = nullptr) const { + SemanticValues vs; + std::any dt; + return parse_core(s, n, vs, dt, path, log); + } + + Result parse(const char *s, const char *path = nullptr, + Log log = nullptr) const { + auto n = strlen(s); + return parse(s, n, path, log); + } + + Result parse(const char *s, size_t n, std::any &dt, + const char *path = nullptr, Log log = nullptr) const { + SemanticValues vs; + return parse_core(s, n, vs, dt, path, log); + } + + Result parse(const char *s, std::any &dt, const char *path = nullptr, + Log log = nullptr) const { + auto n = strlen(s); + return parse(s, n, dt, path, log); + } + + template + Result parse_and_get_value(const char *s, size_t n, T &val, + const char *path = nullptr, + Log log = nullptr) const { + SemanticValues vs; + std::any dt; + auto r = parse_core(s, n, vs, dt, path, log); + if (r.ret && !vs.empty() && vs.front().has_value()) { + val = std::any_cast(vs[0]); } + return r; + } - Definition(const std::shared_ptr &ope) - : holder_(std::make_shared(this)) { - *this <= ope; - } + template + Result parse_and_get_value(const char *s, T &val, const char *path = nullptr, + Log log = nullptr) const { + auto n = strlen(s); + return parse_and_get_value(s, n, val, path, log); + } - operator std::shared_ptr() { - return std::make_shared(holder_); + template + Result parse_and_get_value(const char *s, size_t n, std::any &dt, T &val, + const char *path = nullptr, + Log log = nullptr) const { + SemanticValues vs; + auto r = parse_core(s, n, vs, dt, path, log); + if (r.ret && !vs.empty() && vs.front().has_value()) { + val = std::any_cast(vs[0]); } + return r; + } - Definition &operator<=(const std::shared_ptr &ope) { - holder_->ope_ = ope; - return *this; - } - - Result parse(const char *s, size_t n, const char *path = nullptr, - Log log = nullptr) const { - SemanticValues vs; - std::any dt; - return parse_core(s, n, vs, dt, path, log); - } - - Result parse(const char *s, const char *path = nullptr, - Log log = nullptr) const { - auto n = strlen(s); - return parse(s, n, path, log); - } - - Result parse(const char *s, size_t n, std::any &dt, - const char *path = nullptr, Log log = nullptr) const { - SemanticValues vs; - return parse_core(s, n, vs, dt, path, log); - } - - Result parse(const char *s, std::any &dt, const char *path = nullptr, - Log log = nullptr) const { - auto n = strlen(s); - return parse(s, n, dt, path, log); - } - - template - Result parse_and_get_value(const char *s, size_t n, T &val, - const char *path = nullptr, - Log log = nullptr) const { - SemanticValues vs; - std::any dt; - auto r = parse_core(s, n, vs, dt, path, log); - if (r.ret && !vs.empty() && vs.front().has_value()) { - val = std::any_cast(vs[0]); - } - return r; - } - - template - Result parse_and_get_value(const char *s, T &val, const char *path = nullptr, - Log log = nullptr) const { - auto n = strlen(s); - return parse_and_get_value(s, n, val, path, log); - } - - template - Result parse_and_get_value(const char *s, size_t n, std::any &dt, T &val, - const char *path = nullptr, - Log log = nullptr) const { - SemanticValues vs; - auto r = parse_core(s, n, vs, dt, path, log); - if (r.ret && !vs.empty() && vs.front().has_value()) { - val = std::any_cast(vs[0]); - } - return r; - } - - template - Result parse_and_get_value(const char *s, std::any &dt, T &val, - const char *path = nullptr, - Log log = nullptr) const { - auto n = strlen(s); - return parse_and_get_value(s, n, dt, val, path, log); - } + template + Result parse_and_get_value(const char *s, std::any &dt, T &val, + const char *path = nullptr, + Log log = nullptr) const { + auto n = strlen(s); + return parse_and_get_value(s, n, dt, val, path, log); + } #if defined(__cpp_lib_char8_t) - Result parse(const char8_t *s, size_t n, const char *path = nullptr, - Log log = nullptr) const { - return parse(reinterpret_cast(s), n, path, log); - } + Result parse(const char8_t *s, size_t n, const char *path = nullptr, + Log log = nullptr) const { + return parse(reinterpret_cast(s), n, path, log); + } - Result parse(const char8_t *s, const char *path = nullptr, - Log log = nullptr) const { - return parse(reinterpret_cast(s), path, log); - } + Result parse(const char8_t *s, const char *path = nullptr, + Log log = nullptr) const { + return parse(reinterpret_cast(s), path, log); + } - Result parse(const char8_t *s, size_t n, std::any &dt, - const char *path = nullptr, Log log = nullptr) const { - return parse(reinterpret_cast(s), n, dt, path, log); - } + Result parse(const char8_t *s, size_t n, std::any &dt, + const char *path = nullptr, Log log = nullptr) const { + return parse(reinterpret_cast(s), n, dt, path, log); + } - Result parse(const char8_t *s, std::any &dt, const char *path = nullptr, - Log log = nullptr) const { - return parse(reinterpret_cast(s), dt, path, log); - } + Result parse(const char8_t *s, std::any &dt, const char *path = nullptr, + Log log = nullptr) const { + return parse(reinterpret_cast(s), dt, path, log); + } - template - Result parse_and_get_value(const char8_t *s, size_t n, T &val, - const char *path = nullptr, - Log log = nullptr) const { - return parse_and_get_value(reinterpret_cast(s), n, val, *path, - log); - } + template + Result parse_and_get_value(const char8_t *s, size_t n, T &val, + const char *path = nullptr, + Log log = nullptr) const { + return parse_and_get_value(reinterpret_cast(s), n, val, path, + log); + } - template - Result parse_and_get_value(const char8_t *s, T &val, - const char *path = nullptr, - Log log = nullptr) const { - return parse_and_get_value(reinterpret_cast(s), val, *path, - log); - } + template + Result parse_and_get_value(const char8_t *s, T &val, + const char *path = nullptr, + Log log = nullptr) const { + return parse_and_get_value(reinterpret_cast(s), val, path, + log); + } - template - Result parse_and_get_value(const char8_t *s, size_t n, std::any &dt, T &val, - const char *path = nullptr, - Log log = nullptr) const { - return parse_and_get_value(reinterpret_cast(s), n, dt, val, - *path, log); - } + template + Result parse_and_get_value(const char8_t *s, size_t n, std::any &dt, T &val, + const char *path = nullptr, + Log log = nullptr) const { + return parse_and_get_value(reinterpret_cast(s), n, dt, val, + path, log); + } - template - Result parse_and_get_value(const char8_t *s, std::any &dt, T &val, - const char *path = nullptr, - Log log = nullptr) const { - return parse_and_get_value(reinterpret_cast(s), dt, val, - *path, log); - } + template + Result parse_and_get_value(const char8_t *s, std::any &dt, T &val, + const char *path = nullptr, + Log log = nullptr) const { + return parse_and_get_value(reinterpret_cast(s), dt, val, path, + log); + } #endif - void operator=(Action a) { action = a; } + void operator=(Action a) { action = a; } - template Definition &operator,(T fn) { - operator=(fn); - return *this; - } + template Definition &operator,(T fn) { + operator=(fn); + return *this; + } - Definition &operator~() { - ignoreSemanticValue = true; - return *this; - } + Definition &operator~() { + ignoreSemanticValue = true; + return *this; + } - void accept(Ope::Visitor &v) { holder_->accept(v); } + void accept(Ope::Visitor &v) { holder_->accept(v); } - std::shared_ptr get_core_operator() const { return holder_->ope_; } + std::shared_ptr get_core_operator() const { return holder_->ope_; } - bool is_token() const { - std::call_once(is_token_init_, [this]() { - is_token_ = TokenChecker::is_token(*get_core_operator()); - }); - return is_token_; - } + bool is_token() const { + std::call_once(is_token_init_, [this]() { + is_token_ = TokenChecker::is_token(*get_core_operator()); + }); + return is_token_; + } - std::string name; - const char *s_ = nullptr; - std::pair line_ = {1, 1}; + std::string name; + const char *s_ = nullptr; + std::pair line_ = {1, 1}; - std::function - predicate; + Predicate predicate; - size_t id = 0; - Action action; - std::function - enter; - std::function - leave; - bool ignoreSemanticValue = false; - std::shared_ptr whitespaceOpe; - std::shared_ptr wordOpe; - bool enablePackratParsing = false; - bool is_macro = false; - std::vector params; - bool disable_action = false; + size_t id = 0; + Action action; + std::function + enter; + std::function + leave; + bool ignoreSemanticValue = false; + std::shared_ptr whitespaceOpe; + std::shared_ptr wordOpe; + bool enablePackratParsing = false; + bool is_macro = false; + std::vector params; + bool disable_action = false; + bool is_left_recursive = false; + bool can_be_empty = false; - TracerEnter tracer_enter; - TracerLeave tracer_leave; - bool verbose_trace = false; - TracerStartOrEnd tracer_start; - TracerStartOrEnd tracer_end; + TracerEnter tracer_enter; + TracerLeave tracer_leave; + bool verbose_trace = false; + TracerStartOrEnd tracer_start; + TracerStartOrEnd tracer_end; - std::string error_message; - bool no_ast_opt = false; + std::string error_message; + bool no_ast_opt = false; - bool eoi_check = true; + bool eoi_check = true; + + // Per-rule packrat stats (optional, for profiling) + mutable bool collect_packrat_stats = false; + mutable std::vector packrat_stats_; private: - friend class Reference; - friend class ParserGenerator; + friend class Reference; + friend class ParserGenerator; - Definition &operator=(const Definition &rhs); - Definition &operator=(Definition &&rhs); + Definition &operator=(const Definition &rhs); + Definition &operator=(Definition &&rhs); - void initialize_definition_ids() const { - std::call_once(definition_ids_init_, [&]() { - AssignIDToDefinition vis; - holder_->accept(vis); - if (whitespaceOpe) { whitespaceOpe->accept(vis); } - if (wordOpe) { wordOpe->accept(vis); } - definition_ids_.swap(vis.ids); - }); + void initialize_definition_ids() const { + std::call_once(definition_ids_init_, [&]() { + AssignIDToDefinition vis; + holder_->accept(vis); + if (whitespaceOpe) { whitespaceOpe->accept(vis); } + if (wordOpe) { wordOpe->accept(vis); } + definition_ids_.swap(vis.ids); + }); + } + + void initialize_packrat_filter() const; + + Result parse_core(const char *s, size_t n, SemanticValues &vs, std::any &dt, + const char *path, Log log) const { + initialize_definition_ids(); + + std::shared_ptr ope = holder_; + + std::any trace_data; + if (tracer_start) { tracer_start(trace_data); } + auto se = scope_exit([&]() { + if (tracer_end) { tracer_end(trace_data); } + }); + + Context c(path, s, n, definition_ids_.size(), whitespaceOpe, wordOpe, + enablePackratParsing, tracer_enter, tracer_leave, trace_data, + verbose_trace, log); + + if (collect_packrat_stats) { + packrat_stats_.resize(definition_ids_.size()); + c.packrat_stats = &packrat_stats_; } - Result parse_core(const char *s, size_t n, SemanticValues &vs, std::any &dt, - const char *path, Log log) const { - initialize_definition_ids(); - - std::shared_ptr ope = holder_; - - std::any trace_data; - if (tracer_start) { tracer_start(trace_data); } - auto se1 = scope_exit([&]() { - if (tracer_end) { tracer_end(trace_data); } - }); - - Context c(path, s, n, definition_ids_.size(), whitespaceOpe, wordOpe, - enablePackratParsing, tracer_enter, tracer_leave, trace_data, - verbose_trace, log); - - size_t i = 0; - - if (whitespaceOpe) { - auto save_ignore_trace_state = c.ignore_trace_state; - c.ignore_trace_state = !c.verbose_trace; - auto se2 = - scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); - - auto len = whitespaceOpe->parse(s, n, vs, c, dt); - if (fail(len)) { return Result{false, c.recovered, i, c.error_info}; } - - i = len; - } - - auto len = ope->parse(s + i, n - i, vs, c, dt); - auto ret = success(len); - if (ret) { - i += len; - if (eoi_check) { - if (i < n) { - if (c.error_info.error_pos - c.s < s + i - c.s) { - c.error_info.message_pos = s + i; - c.error_info.message = "expected end of input"; - } - ret = false; - } - } - } - return Result{ret, c.recovered, i, c.error_info}; + if (enablePackratParsing) { + initialize_packrat_filter(); + if (!packrat_filter_.empty()) { + c.packrat_rule_filter = &packrat_filter_; + } } - std::shared_ptr holder_; - mutable std::once_flag is_token_init_; - mutable bool is_token_ = false; - mutable std::once_flag assign_id_to_definition_init_; - mutable std::once_flag definition_ids_init_; - mutable std::unordered_map definition_ids_; + size_t i = 0; + + if (whitespaceOpe) { + auto save_ignore_trace_state = c.ignore_trace_state; + c.ignore_trace_state = !c.verbose_trace; + auto se = + scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); + + auto len = whitespaceOpe->parse(s, n, vs, c, dt); + if (fail(len)) { return Result{false, c.recovered, i, c.error_info}; } + + i = len; + } + + auto len = ope->parse(s + i, n - i, vs, c, dt); + auto ret = success(len); + if (ret) { + i += len; + if (eoi_check) { + if (i < n) { + if (c.error_info.error_pos - c.s < s + i - c.s) { + c.error_info.message_pos = s + i; + c.error_info.message = "expected end of input"; + } + ret = false; + } + } + } + return Result{ret, c.recovered, i, c.error_info}; + } + + std::shared_ptr holder_; + mutable std::once_flag is_token_init_; + mutable bool is_token_ = false; + mutable std::once_flag assign_id_to_definition_init_; + mutable std::once_flag definition_ids_init_; + mutable std::unordered_map definition_ids_; + mutable std::once_flag packrat_filter_init_; + mutable std::vector packrat_filter_; }; /* @@ -2487,592 +2876,704 @@ private: inline size_t parse_literal(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt, const std::string &lit, std::once_flag &init_is_word, bool &is_word, - bool ignore_case) { - size_t i = 0; - for (; i < lit.size(); i++) { - if (i >= n || (ignore_case ? (std::tolower(s[i]) != std::tolower(lit[i])) - : (s[i] != lit[i]))) { - c.set_error_pos(s, lit.data()); - return static_cast(-1); - } + bool ignore_case, const std::string &lower_lit) { + size_t i = 0; + for (; i < lit.size(); i++) { + if (i >= n || + (ignore_case ? (static_cast(std::tolower( + static_cast(s[i]))) != lower_lit[i]) + : (s[i] != lit[i]))) { + c.set_error_pos(s, lit.data()); + return static_cast(-1); } + } - // Word check - if (c.wordOpe) { - auto save_ignore_trace_state = c.ignore_trace_state; - c.ignore_trace_state = !c.verbose_trace; - auto se = - scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); + // Word check + if (c.wordOpe) { + auto save_ignore_trace_state = c.ignore_trace_state; + c.ignore_trace_state = !c.verbose_trace; + auto se = + scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); - std::call_once(init_is_word, [&]() { - SemanticValues dummy_vs; - Context dummy_c(nullptr, c.s, c.l, 0, nullptr, nullptr, false, nullptr, - nullptr, nullptr, false, nullptr); - std::any dummy_dt; + std::call_once(init_is_word, [&]() { + SemanticValues dummy_vs; + Context dummy_c(nullptr, c.s, c.l, 0, nullptr, nullptr, false, nullptr, + nullptr, nullptr, false, nullptr); + std::any dummy_dt; - auto len = - c.wordOpe->parse(lit.data(), lit.size(), dummy_vs, dummy_c, dummy_dt); - is_word = success(len); - }); + auto len = + c.wordOpe->parse(lit.data(), lit.size(), dummy_vs, dummy_c, dummy_dt); + is_word = success(len); + }); - if (is_word) { - SemanticValues dummy_vs; - Context dummy_c(nullptr, c.s, c.l, 0, nullptr, nullptr, false, nullptr, - nullptr, nullptr, false, nullptr); - std::any dummy_dt; + if (is_word) { + SemanticValues dummy_vs; + Context dummy_c(nullptr, c.s, c.l, 0, nullptr, nullptr, false, nullptr, + nullptr, nullptr, false, nullptr); + std::any dummy_dt; - NotPredicate ope(c.wordOpe); - auto len = ope.parse(s + i, n - i, dummy_vs, dummy_c, dummy_dt); - if (fail(len)) { - c.set_error_pos(s, lit.data()); - return len; - } - i += len; - } + NotPredicate ope(c.wordOpe); + auto len = ope.parse(s + i, n - i, dummy_vs, dummy_c, dummy_dt); + if (fail(len)) { + c.set_error_pos(s, lit.data()); + return len; + } + i += len; } + } - // Skip whitespace - if (!c.in_token_boundary_count && c.whitespaceOpe) { - auto save_ignore_trace_state = c.ignore_trace_state; - c.ignore_trace_state = !c.verbose_trace; - auto se = - scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); + // Skip whitespace + auto wl = c.skip_whitespace(s + i, n - i, vs, dt); + if (fail(wl)) { return wl; } + i += wl; - auto len = c.whitespaceOpe->parse(s + i, n - i, vs, c, dt); - if (fail(len)) { return len; } - i += len; - } - - return i; + return i; } inline std::pair SemanticValues::line_info() const { - assert(c_); - return c_->line_info(sv_.data()); + assert(c_); + return c_->line_info(sv_.data()); } inline void ErrorInfo::output_log(const Log &log, const char *s, size_t n) { - if (message_pos) { - if (message_pos > last_output_pos) { - last_output_pos = message_pos; - auto line = line_info(s, message_pos); - std::string msg; - if (auto unexpected_token = heuristic_error_token(s, n, message_pos); - !unexpected_token.empty()) { - msg = replace_all(message, "%t", unexpected_token); + if (message_pos) { + if (message_pos > last_output_pos) { + last_output_pos = message_pos; + auto line = line_info(s, message_pos); + std::string msg; + if (auto unexpected_token = heuristic_error_token(s, n, message_pos); + !unexpected_token.empty()) { + msg = replace_all(message, "%t", unexpected_token); - auto unexpected_char = unexpected_token.substr( - 0, - codepoint_length(unexpected_token.data(), unexpected_token.size())); + auto unexpected_char = unexpected_token.substr( + 0, + codepoint_length(unexpected_token.data(), unexpected_token.size())); - msg = replace_all(msg, "%c", unexpected_char); - } else { - msg = message; - } - log(line.first, line.second, msg, label); - } - } else if (error_pos) { - if (error_pos > last_output_pos) { - last_output_pos = error_pos; - auto line = line_info(s, error_pos); - - std::string msg; - if (expected_tokens.empty()) { - msg = "syntax error."; - } else { - msg = "syntax error"; - - // unexpected token - if (auto unexpected_token = heuristic_error_token(s, n, error_pos); - !unexpected_token.empty()) { - msg += ", unexpected '"; - msg += unexpected_token; - msg += "'"; - } - - auto first_item = true; - size_t i = 0; - while (i < expected_tokens.size()) { - auto [error_literal, error_rule] = expected_tokens[i]; - - // Skip rules start with '_' - if (!(error_rule && error_rule->name[0] == '_')) { - msg += (first_item ? ", expecting " : ", "); - if (error_literal) { - msg += "'"; - msg += error_literal; - msg += "'"; - } else { - msg += "<" + error_rule->name + ">"; - if (label.empty()) { label = error_rule->name; } - } - first_item = false; - } - - i++; - } - msg += "."; - } - log(line.first, line.second, msg, label); - } + msg = replace_all(msg, "%c", unexpected_char); + } else { + msg = message; + } + log(line.first, line.second, msg, label); } + } else if (error_pos) { + if (error_pos > last_output_pos) { + last_output_pos = error_pos; + auto line = line_info(s, error_pos); + + std::string msg; + if (expected_tokens.empty()) { + msg = "syntax error."; + } else { + msg = "syntax error"; + + // unexpected token + if (auto unexpected_token = heuristic_error_token(s, n, error_pos); + !unexpected_token.empty()) { + msg += ", unexpected '"; + msg += unexpected_token; + msg += "'"; + } + + auto first_item = true; + size_t i = 0; + while (i < expected_tokens.size()) { + auto [error_literal, error_rule] = expected_tokens[i]; + + // Skip rules start with '_' + if (!(error_rule && error_rule->name[0] == '_')) { + msg += (first_item ? ", expecting " : ", "); + if (error_literal) { + msg += "'"; + msg += error_literal; + msg += "'"; + } else { + msg += "<" + error_rule->name + ">"; + if (label.empty()) { label = error_rule->name; } + } + first_item = false; + } + + i++; + } + msg += "."; + } + log(line.first, line.second, msg, label); + } + } +} + +inline size_t Context::skip_whitespace(const char *a_s, size_t n, + SemanticValues &vs, std::any &dt) { + if (in_token_boundary_count || !whitespaceOpe) { return 0; } + auto save = ignore_trace_state; + ignore_trace_state = !verbose_trace; + auto se = scope_exit([&]() { ignore_trace_state = save; }); + return whitespaceOpe->parse(a_s, n, vs, *this, dt); } inline void Context::set_error_pos(const char *a_s, const char *literal) { - if (log) { - if (error_info.error_pos <= a_s) { - if (error_info.error_pos < a_s || !error_info.keep_previous_token) { - error_info.error_pos = a_s; - error_info.expected_tokens.clear(); - } + if (log) { + if (error_info.error_pos <= a_s) { + if (error_info.error_pos < a_s || !error_info.keep_previous_token) { + error_info.error_pos = a_s; + error_info.expected_tokens.clear(); + } - const char *error_literal = nullptr; - const Definition *error_rule = nullptr; + const char *error_literal = nullptr; + const Definition *error_rule = nullptr; - if (literal) { - error_literal = literal; - } else if (!rule_stack.empty()) { - auto rule = rule_stack.back(); - auto ope = rule->get_core_operator(); - if (auto token = FindLiteralToken::token(*ope); - token && token[0] != '\0') { - error_literal = token; - } - } - - for (auto r : rule_stack) { - error_rule = r; - if (r->is_token()) { break; } - } - - if (error_literal || error_rule) { - error_info.add(error_literal, error_rule); - } + if (literal) { + error_literal = literal; + } else if (!rule_stack.empty()) { + auto rule = rule_stack.back(); + auto ope = rule->get_core_operator(); + if (auto token = FindLiteralToken::token(*ope); + token && token[0] != '\0') { + error_literal = token; } + } + + for (auto r : rule_stack) { + error_rule = r; + if (r->is_token()) { break; } + } + + if (error_literal || error_rule) { + error_info.add(error_literal, error_rule); + } } + } } inline void Context::trace_enter(const Ope &ope, const char *a_s, size_t n, const SemanticValues &vs, std::any &dt) { - trace_ids.push_back(next_trace_id++); - tracer_enter(ope, a_s, n, vs, *this, dt, trace_data); + trace_ids.push_back(next_trace_id++); + tracer_enter(ope, a_s, n, vs, *this, dt, trace_data); } inline void Context::trace_leave(const Ope &ope, const char *a_s, size_t n, const SemanticValues &vs, std::any &dt, size_t len) { - tracer_leave(ope, a_s, n, vs, *this, dt, len, trace_data); - trace_ids.pop_back(); + tracer_leave(ope, a_s, n, vs, *this, dt, len, trace_data); + trace_ids.pop_back(); } inline bool Context::is_traceable(const Ope &ope) const { - if (tracer_enter && tracer_leave) { - if (ignore_trace_state) { return false; } - return !dynamic_cast(&ope); - } - return false; + if (tracer_enter && tracer_leave) { + if (ignore_trace_state) { return false; } + return !dynamic_cast(&ope); + } + return false; } inline size_t Ope::parse(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { - if (c.is_traceable(*this)) { - c.trace_enter(*this, s, n, vs, dt); - auto len = parse_core(s, n, vs, c, dt); - c.trace_leave(*this, s, n, vs, dt, len); - return len; - } - return parse_core(s, n, vs, c, dt); + if (c.is_traceable(*this)) { + c.trace_enter(*this, s, n, vs, dt); + auto len = parse_core(s, n, vs, c, dt); + c.trace_leave(*this, s, n, vs, dt, len); + return len; + } + return parse_core(s, n, vs, c, dt); } inline size_t Dictionary::parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { - size_t id; - auto i = trie_.match(s, n, id); + size_t id; + auto i = trie_.match(s, n, id); - if (i == 0) { + if (i == 0) { + c.set_error_pos(s); + return static_cast(-1); + } + + vs.choice_count_ = trie_.items_count(); + vs.choice_ = id; + + // Word check + if (c.wordOpe) { + auto save_ignore_trace_state = c.ignore_trace_state; + c.ignore_trace_state = !c.verbose_trace; + auto se = + scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); + + { + SemanticValues dummy_vs; + Context dummy_c(nullptr, c.s, c.l, 0, nullptr, nullptr, false, nullptr, + nullptr, nullptr, false, nullptr); + std::any dummy_dt; + + NotPredicate ope(c.wordOpe); + auto len = ope.parse(s + i, n - i, dummy_vs, dummy_c, dummy_dt); + if (fail(len)) { c.set_error_pos(s); - return static_cast(-1); + return len; + } + i += len; } + } - vs.choice_count_ = trie_.size(); - vs.choice_ = id; + // Skip whitespace + auto wl = c.skip_whitespace(s + i, n - i, vs, dt); + if (fail(wl)) { return wl; } + i += wl; - // Word check - if (c.wordOpe) { - auto save_ignore_trace_state = c.ignore_trace_state; - c.ignore_trace_state = !c.verbose_trace; - auto se = - scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); - - { - SemanticValues dummy_vs; - Context dummy_c(nullptr, c.s, c.l, 0, nullptr, nullptr, false, nullptr, - nullptr, nullptr, false, nullptr); - std::any dummy_dt; - - NotPredicate ope(c.wordOpe); - auto len = ope.parse(s + i, n - i, dummy_vs, dummy_c, dummy_dt); - if (fail(len)) { - c.set_error_pos(s); - return len; - } - i += len; - } - } - - // Skip whitespace - if (!c.in_token_boundary_count && c.whitespaceOpe) { - auto save_ignore_trace_state = c.ignore_trace_state; - c.ignore_trace_state = !c.verbose_trace; - auto se = - scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); - - auto len = c.whitespaceOpe->parse(s + i, n - i, vs, c, dt); - if (fail(len)) { return len; } - i += len; - } - - return i; + return i; } inline size_t LiteralString::parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { - return parse_literal(s, n, vs, c, dt, lit_, init_is_word_, is_word_, - ignore_case_); + return parse_literal(s, n, vs, c, dt, lit_, init_is_word_, is_word_, + ignore_case_, lower_lit_); } inline size_t TokenBoundary::parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { - auto save_ignore_trace_state = c.ignore_trace_state; - c.ignore_trace_state = !c.verbose_trace; - auto se1 = - scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); + auto save_ignore_trace_state = c.ignore_trace_state; + c.ignore_trace_state = !c.verbose_trace; + auto se = + scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); - size_t len; - { - c.in_token_boundary_count++; - auto se2 = scope_exit([&]() { c.in_token_boundary_count--; }); - len = ope_->parse(s, n, vs, c, dt); - } + size_t len; + { + c.in_token_boundary_count++; + auto se = scope_exit([&]() { c.in_token_boundary_count--; }); + len = ope_->parse(s, n, vs, c, dt); + } - if (success(len)) { - vs.tokens.emplace_back(std::string_view(s, len)); + if (success(len)) { + vs.tokens.emplace_back(std::string_view(s, len)); - if (!c.in_token_boundary_count) { - if (c.whitespaceOpe) { - auto l = c.whitespaceOpe->parse(s + len, n - len, vs, c, dt); - if (fail(l)) { return l; } - len += l; - } - } - } - return len; + auto wl = c.skip_whitespace(s + len, n - len, vs, dt); + if (fail(wl)) { return wl; } + len += wl; + } + return len; } inline size_t Holder::parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { - if (!ope_) { - throw std::logic_error("Uninitialized definition ope was used..."); - } + if (!ope_) { + throw std::logic_error("Uninitialized definition ope was used..."); + } - // Macro reference - if (outer_->is_macro) { - c.rule_stack.push_back(outer_); - auto len = ope_->parse(s, n, vs, c, dt); - c.rule_stack.pop_back(); - return len; - } + // Macro reference + if (outer_->is_macro) { + c.rule_stack.push_back(outer_); + auto len = ope_->parse(s, n, vs, c, dt); + c.rule_stack.pop_back(); + return len; + } - size_t len; - std::any val; + size_t len; + std::any val; - c.packrat(s, outer_->id, len, val, [&](std::any &a_val) { - if (outer_->enter) { outer_->enter(c, s, n, dt); } - auto &chvs = c.push_semantic_values_scope(); - auto se = scope_exit([&]() { - c.pop_semantic_values_scope(); - if (outer_->leave) { outer_->leave(c, s, n, len, a_val, dt); } - }); + // Shared parse body: invokes enter/leave callbacks, parses the rule's + // operator, handles actions/predicates/errors, and calls reduce. + // Returns {parse_len, parse_val}. + auto do_parse = [&]() { + size_t parse_len; + std::any parse_val; - c.rule_stack.push_back(outer_); - len = ope_->parse(s, n, chvs, c, dt); - c.rule_stack.pop_back(); - - // Invoke action - if (success(len)) { - chvs.sv_ = std::string_view(s, len); - chvs.name_ = outer_->name; - - auto ope_ptr = ope_.get(); - { - auto tok_ptr = dynamic_cast(ope_ptr); - if (tok_ptr) { ope_ptr = tok_ptr->ope_.get(); } - } - if (!dynamic_cast(ope_ptr) && - !dynamic_cast(ope_ptr)) { - chvs.choice_count_ = 0; - chvs.choice_ = 0; - } - - std::string msg; - if (outer_->predicate && !outer_->predicate(chvs, dt, msg)) { - if (c.log && !msg.empty() && c.error_info.message_pos < s) { - c.error_info.message_pos = s; - c.error_info.message = msg; - c.error_info.label = outer_->name; - } - len = static_cast(-1); - } - - if (success(len)) { - if (!c.recovered) { a_val = reduce(chvs, dt); } - } else { - if (c.log && !msg.empty() && c.error_info.message_pos < s) { - c.error_info.message_pos = s; - c.error_info.message = msg; - c.error_info.label = outer_->name; - } - } - } else { - if (c.log && !outer_->error_message.empty() && - c.error_info.message_pos < s) { - c.error_info.message_pos = s; - c.error_info.message = outer_->error_message; - c.error_info.label = outer_->name; - } - } + if (outer_->enter) { outer_->enter(c, s, n, dt); } + auto &chvs = c.push_semantic_values_scope(); + auto se = scope_exit([&]() { + c.pop_semantic_values_scope(); + if (outer_->leave) { outer_->leave(c, s, n, parse_len, parse_val, dt); } }); - if (success(len)) { - if (!outer_->ignoreSemanticValue) { - vs.emplace_back(std::move(val)); - vs.tags.emplace_back(str2tag(outer_->name)); + c.rule_stack.push_back(outer_); + parse_len = ope_->parse(s, n, chvs, c, dt); + c.rule_stack.pop_back(); + + if (success(parse_len)) { + chvs.sv_ = std::string_view(s, parse_len); + chvs.name_ = outer_->name; + + auto ope_ptr = ope_.get(); + if (ope_ptr->is_token_boundary) { + ope_ptr = static_cast(ope_ptr)->ope_.get(); + } + if (!ope_ptr->is_choice_like) { + chvs.choice_count_ = 0; + chvs.choice_ = 0; + } + + std::string msg; + std::any predicate_data; + if (outer_->predicate) { + if (!outer_->predicate(chvs, dt, msg, predicate_data)) { + if (c.log && !msg.empty() && c.error_info.message_pos < s) { + c.error_info.message_pos = s; + c.error_info.message = msg; + c.error_info.label = outer_->name; + } + parse_len = static_cast(-1); } + } + + if (success(parse_len)) { + if (!c.recovered) { parse_val = reduce(chvs, dt, predicate_data); } + } else { + if (c.log && !msg.empty() && c.error_info.message_pos < s) { + c.error_info.message_pos = s; + c.error_info.message = msg; + c.error_info.label = outer_->name; + } + } + } else { + if (c.log && !outer_->error_message.empty() && + c.error_info.message_pos < s) { + c.error_info.message_pos = s; + c.error_info.message = outer_->error_message; + c.error_info.label = outer_->name; + } } - return len; + return std::make_pair(parse_len, std::move(parse_val)); + }; + + if (outer_->is_left_recursive) { + auto lr_key = std::make_pair(outer_, s); + + // Check LR memo first + auto it = c.lr_memo.find(lr_key); + if (it != c.lr_memo.end()) { + if (success(it->second.len)) { + len = it->second.len; + val = it->second.val; + } else { + len = static_cast(-1); + } + // Record that this rule's lr_memo was accessed. + // Any LR rule currently seeding will know we're in its cycle. + c.lr_refs_hit.insert(outer_); + } else { + // Seed with FAIL + c.lr_memo[lr_key] = {static_cast(-1), {}}; + + // Mark as active seed (protects our lr_memo from inner growers) + c.lr_active_seeds.insert(lr_key); + auto seed_guard = scope_exit([&]() { c.lr_active_seeds.erase(lr_key); }); + + // Track which LR rules are referenced during our parse + // to identify cycle members + auto saved_refs = std::move(c.lr_refs_hit); + c.lr_refs_hit.clear(); + + // Initial parse (self-references will hit the FAIL seed) + auto [initial_len, initial_val] = do_parse(); + + // Rules whose lr_memo was hit during our parse are in our cycle. + // If we detected cycle members, we ourselves are also part of + // the cycle, so add self — this lets parent seeders see us as + // a transitive cycle member. + auto cycle_rules = c.lr_refs_hit; + if (!cycle_rules.empty()) { cycle_rules.insert(outer_); } + + // Restore parent's refs and propagate cycle info upward + c.lr_refs_hit = std::move(saved_refs); + c.lr_refs_hit.insert(cycle_rules.begin(), cycle_rules.end()); + + if (!success(initial_len)) { + // Keep FAIL in lr_memo so we don't re-seed + len = static_cast(-1); + } else { + // Got initial seed, now grow + len = initial_len; + val = std::move(initial_val); + c.lr_memo[lr_key] = {len, val}; + + while (true) { + // Clear this rule's packrat cache + c.clear_packrat_cache(s, outer_->id); + + // Clear lr_memo for cycle-dependent rules at this position, + // but NOT for rules currently in their own seeding phase + // (lr_active_seeds) — those are outer growers we must not + // interfere with. + for (auto memo_it = c.lr_memo.begin(); memo_it != c.lr_memo.end();) { + if (memo_it->first.second == s && memo_it->first.first != outer_ && + cycle_rules.count(memo_it->first.first) && + !c.lr_active_seeds.count(memo_it->first)) { + memo_it = c.lr_memo.erase(memo_it); + } else { + ++memo_it; + } + } + + auto [new_len, new_val] = do_parse(); + + if (!success(new_len) || new_len <= len) { + break; // No improvement, done growing + } + + len = new_len; + val = std::move(new_val); + c.lr_memo[lr_key] = {len, val}; + } + } + + // Write final result to packrat cache (lr_memo entry is kept as + // the primary lookup for LR rules at this position) + if (success(len)) { c.write_packrat_cache(s, outer_->id, len, val); } + } + } else { + if (c.enablePackratParsing) { + // Packrat cache acts as re-entry guard (pre-registered as + // failure before fn is called). + c.packrat(s, outer_->id, len, val, [&](std::any &a_val) { + auto [parse_len, parse_val] = do_parse(); + len = parse_len; + if (success(len)) { a_val = std::move(parse_val); } + }); + } else { + // Without packrat, use lr_memo as re-entry guard to prevent + // stack overflow from undetected left recursion. + auto guard_key = std::make_pair(outer_, s); + if (c.lr_memo.count(guard_key)) { + len = static_cast(-1); + } else { + c.lr_memo[guard_key] = {static_cast(-1), {}}; + auto [parse_len, parse_val] = do_parse(); + len = parse_len; + val = std::move(parse_val); + c.lr_memo.erase(guard_key); + } + } + } + + if (success(len)) { + if (!outer_->ignoreSemanticValue) { + vs.emplace_back(std::move(val)); + vs.tags.emplace_back(str2tag(outer_->name)); + } + } + + return len; } -inline std::any Holder::reduce(SemanticValues &vs, std::any &dt) const { - if (outer_->action && !outer_->disable_action) { - return outer_->action(vs, dt); - } else if (vs.empty()) { - return std::any(); - } else { - return std::move(vs.front()); - } +inline std::any Holder::reduce(SemanticValues &vs, std::any &dt, + const std::any &predicate_data) const { + if (outer_->action && !outer_->disable_action) { + return outer_->action(vs, dt, predicate_data); + } else if (vs.empty()) { + return std::any(); + } else { + return std::move(vs.front()); + } } inline const std::string &Holder::name() const { return outer_->name; } inline const std::string &Holder::trace_name() const { - std::call_once(trace_name_init_, - [this]() { trace_name_ = "[" + outer_->name + "]"; }); - return trace_name_; + std::call_once(trace_name_init_, + [this]() { trace_name_ = "[" + outer_->name + "]"; }); + return trace_name_; } inline size_t Reference::parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { - auto save_ignore_trace_state = c.ignore_trace_state; - if (rule_ && rule_->ignoreSemanticValue) { - c.ignore_trace_state = !c.verbose_trace; - } - auto se1 = - scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); + auto save_ignore_trace_state = c.ignore_trace_state; + if (rule_ && rule_->ignoreSemanticValue) { + c.ignore_trace_state = !c.verbose_trace; + } + auto se = + scope_exit([&]() { c.ignore_trace_state = save_ignore_trace_state; }); - if (rule_) { - // Reference rule - if (rule_->is_macro) { - // Macro - FindReference vis(c.top_args(), c.rule_stack.back()->params); + if (rule_) { + // Reference rule + if (rule_->is_macro) { + // Macro + FindReference vis(c.top_args(), c.rule_stack.back()->params); - // Collect arguments - std::vector> args; - for (auto arg : args_) { - arg->accept(vis); - args.emplace_back(std::move(vis.found_ope)); - } + // Collect arguments + std::vector> args; + for (const auto &arg : args_) { + arg->accept(vis); + args.emplace_back(std::move(vis.found_ope)); + } - c.push_args(std::move(args)); - auto se2 = scope_exit([&]() { c.pop_args(); }); - auto ope = get_core_operator(); - return ope->parse(s, n, vs, c, dt); - } else { - // Definition - c.push_args(std::vector>()); - auto se3 = scope_exit([&]() { c.pop_args(); }); - auto ope = get_core_operator(); - return ope->parse(s, n, vs, c, dt); - } + c.push_args(std::move(args)); + auto se = scope_exit([&]() { c.pop_args(); }); + return rule_->holder_->parse(s, n, vs, c, dt); } else { - // Reference parameter in macro - const auto &args = c.top_args(); - return args[iarg_]->parse(s, n, vs, c, dt); + // Definition + c.push_args(std::vector>()); + auto se2 = scope_exit([&]() { c.pop_args(); }); + return rule_->holder_->parse(s, n, vs, c, dt); } + } else { + // Reference parameter in macro + const auto &args = c.top_args(); + return args[iarg_]->parse(s, n, vs, c, dt); + } } inline std::shared_ptr Reference::get_core_operator() const { - return rule_->holder_; + return rule_->holder_; } inline size_t BackReference::parse_core(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt) const { - auto size = static_cast(c.capture_scope_stack_size); - for (auto i = size - 1; i >= 0; i--) { - auto index = static_cast(i); - const auto &cs = c.capture_scope_stack[index]; - if (cs.find(name_) != cs.end()) { - const auto &lit = cs.at(name_); - std::once_flag init_is_word; - auto is_word = false; - return parse_literal(s, n, vs, c, dt, lit, init_is_word, is_word, false); - } + for (auto it = c.capture_entries.rbegin(); it != c.capture_entries.rend(); + ++it) { + if (it->first == name_) { + const auto &lit = it->second; + std::once_flag init_is_word; + auto is_word = false; + static const std::string empty; + return parse_literal(s, n, vs, c, dt, lit, init_is_word, is_word, false, + empty); } + } - c.error_info.message_pos = s; - c.error_info.message = "undefined back reference '$" + name_ + "'..."; - return static_cast(-1); + c.error_info.message_pos = s; + c.error_info.message = "undefined back reference '$" + name_ + "'..."; + return static_cast(-1); } inline Definition & PrecedenceClimbing::get_reference_for_binop(Context &c) const { - if (rule_.is_macro) { - // Reference parameter in macro - const auto &args = c.top_args(); - auto iarg = dynamic_cast(*binop_).iarg_; - auto arg = args[iarg]; - return *dynamic_cast(*arg).rule_; - } + if (rule_.is_macro) { + // Reference parameter in macro + const auto &args = c.top_args(); + auto iarg = dynamic_cast(*binop_).iarg_; + auto arg = args[iarg]; + return *dynamic_cast(*arg).rule_; + } - return *dynamic_cast(*binop_).rule_; + return *dynamic_cast(*binop_).rule_; } inline size_t PrecedenceClimbing::parse_expression(const char *s, size_t n, SemanticValues &vs, Context &c, std::any &dt, size_t min_prec) const { - auto len = atom_->parse(s, n, vs, c, dt); - if (fail(len)) { return len; } + auto len = atom_->parse(s, n, vs, c, dt); + if (fail(len)) { return len; } - std::string tok; - auto &rule = get_reference_for_binop(c); - auto action = std::move(rule.action); + std::string tok; + auto &rule = get_reference_for_binop(c); + auto action = std::move(rule.action); - rule.action = [&](SemanticValues &vs2, std::any &dt2) { - tok = vs2.token(); - if (action) { - return action(vs2, dt2); - } else if (!vs2.empty()) { - return vs2[0]; - } - return std::any(); - }; - auto action_se = scope_exit([&]() { rule.action = std::move(action); }); + rule.action = [&](SemanticValues &vs2, std::any &dt2, + const std::any &predicate_data2) { + tok = vs2.token(); + if (action) { + return action(vs2, dt2, predicate_data2); + } else if (!vs2.empty()) { + return vs2[0]; + } + return std::any(); + }; + auto action_se = scope_exit([&]() { rule.action = std::move(action); }); - auto i = len; - while (i < n) { - std::vector save_values(vs.begin(), vs.end()); - auto save_tokens = vs.tokens; + auto i = len; + while (i < n) { + std::vector save_values(vs.begin(), vs.end()); + auto save_tokens = vs.tokens; - auto chvs = c.push_semantic_values_scope(); - auto chlen = binop_->parse(s + i, n - i, chvs, c, dt); - c.pop_semantic_values_scope(); + auto chvs = c.push_semantic_values_scope(); + auto chlen = binop_->parse(s + i, n - i, chvs, c, dt); + c.pop_semantic_values_scope(); - if (fail(chlen)) { break; } + if (fail(chlen)) { break; } - auto it = info_.find(tok); - if (it == info_.end()) { break; } + auto it = info_.find(tok); + if (it == info_.end()) { break; } - auto level = std::get<0>(it->second); - auto assoc = std::get<1>(it->second); + auto level = std::get<0>(it->second); + auto assoc = std::get<1>(it->second); - if (level < min_prec) { break; } + if (level < min_prec) { break; } - vs.emplace_back(std::move(chvs[0])); - i += chlen; + vs.emplace_back(std::move(chvs[0])); + i += chlen; - auto next_min_prec = level; - if (assoc == 'L') { next_min_prec = level + 1; } + auto next_min_prec = level; + if (assoc == 'L') { next_min_prec = level + 1; } - chvs = c.push_semantic_values_scope(); - chlen = parse_expression(s + i, n - i, chvs, c, dt, next_min_prec); - c.pop_semantic_values_scope(); + chvs = c.push_semantic_values_scope(); + chlen = parse_expression(s + i, n - i, chvs, c, dt, next_min_prec); + c.pop_semantic_values_scope(); - if (fail(chlen)) { - vs.assign(save_values.begin(), save_values.end()); - vs.tokens = save_tokens; - i = chlen; - break; - } - - vs.emplace_back(std::move(chvs[0])); - i += chlen; - - std::any val; - if (rule_.action) { - vs.sv_ = std::string_view(s, i); - val = rule_.action(vs, dt); - } else if (!vs.empty()) { - val = vs[0]; - } - vs.clear(); - vs.emplace_back(std::move(val)); + if (fail(chlen)) { + vs.assign(save_values.begin(), save_values.end()); + vs.tokens = save_tokens; + i = chlen; + break; } - return i; + vs.emplace_back(std::move(chvs[0])); + i += chlen; + + std::any val; + if (rule_.action) { + vs.sv_ = std::string_view(s, i); + static const std::any empty_predicate_data; + val = rule_.action(vs, dt, empty_predicate_data); + } else if (!vs.empty()) { + val = vs[0]; + } + vs.clear(); + vs.emplace_back(std::move(val)); + } + + return i; } inline size_t Recovery::parse_core(const char *s, size_t n, SemanticValues & /*vs*/, Context &c, std::any & /*dt*/) const { - const auto &rule = dynamic_cast(*ope_); + const auto &rule = dynamic_cast(*ope_); + + // Custom error message + if (c.log) { + auto label = dynamic_cast(rule.args_[0].get()); + if (label && !label->rule_->error_message.empty()) { + c.error_info.message_pos = s; + c.error_info.message = label->rule_->error_message; + c.error_info.label = label->rule_->name; + } + } + + // Recovery + auto len = static_cast(-1); + { + auto save_log = c.log; + c.log = nullptr; + auto se = scope_exit([&]() { c.log = save_log; }); + + SemanticValues dummy_vs; + std::any dummy_dt; + + len = rule.parse(s, n, dummy_vs, c, dummy_dt); + } + + if (success(len)) { + c.recovered = true; - // Custom error message if (c.log) { - auto label = dynamic_cast(rule.args_[0].get()); - if (label && !label->rule_->error_message.empty()) { - c.error_info.message_pos = s; - c.error_info.message = label->rule_->error_message; - c.error_info.label = label->rule_->name; - } + c.error_info.output_log(c.log, c.s, c.l); + c.error_info.clear(); } + } - // Recovery - auto len = static_cast(-1); - { - auto save_log = c.log; - c.log = nullptr; - auto se = scope_exit([&]() { c.log = save_log; }); + // Cut + if (!c.cut_stack.empty()) { + c.cut_stack.back() = true; - SemanticValues dummy_vs; - std::any dummy_dt; - - len = rule.parse(s, n, dummy_vs, c, dummy_dt); + if (c.cut_stack.size() == 1) { + // TODO: Remove unneeded entries in packrat memoise table } + } - if (success(len)) { - c.recovered = true; - - if (c.log) { - c.error_info.output_log(c.log, c.s, c.l); - c.error_info.clear(); - } - } - - // Cut - if (!c.cut_stack.empty()) { - c.cut_stack.back() = true; - - if (c.cut_stack.size() == 1) { - //! \todo Remove unneeded entries in packrat memoise table - } - } - - return len; + return len; } inline void Sequence::accept(Visitor &v) { v.visit(*this); } @@ -3100,194 +3601,452 @@ inline void Recovery::accept(Visitor &v) { v.visit(*this); } inline void Cut::accept(Visitor &v) { v.visit(*this); } inline void AssignIDToDefinition::visit(Holder &ope) { - auto p = static_cast(ope.outer_); - if (ids.count(p)) { return; } - auto id = ids.size(); - ids[p] = id; - ope.outer_->id = id; - ope.ope_->accept(*this); + auto p = static_cast(ope.outer_); + if (ids.count(p)) { return; } + auto id = ids.size(); + ids[p] = id; + ope.outer_->id = id; + ope.ope_->accept(*this); } inline void AssignIDToDefinition::visit(Reference &ope) { - if (ope.rule_) { - for (auto arg : ope.args_) { - arg->accept(*this); - } - ope.rule_->accept(*this); + if (ope.rule_) { + for (const auto &arg : ope.args_) { + arg->accept(*this); } + ope.rule_->accept(*this); + } } inline void AssignIDToDefinition::visit(PrecedenceClimbing &ope) { - ope.atom_->accept(*this); - ope.binop_->accept(*this); + ope.atom_->accept(*this); + ope.binop_->accept(*this); } inline void TokenChecker::visit(Reference &ope) { - if (ope.is_macro_) { - for (auto arg : ope.args_) { - arg->accept(*this); - } - } else { - has_rule_ = true; + if (ope.is_macro_) { + for (const auto &arg : ope.args_) { + arg->accept(*this); } + } else { + has_rule_ = true; + } } inline void FindLiteralToken::visit(Reference &ope) { - if (ope.is_macro_) { - ope.rule_->accept(*this); - for (auto arg : ope.args_) { - arg->accept(*this); - } + if (ope.is_macro_) { + ope.rule_->accept(*this); + for (const auto &arg : ope.args_) { + arg->accept(*this); } + } +} + +inline void ComputeCanBeEmpty::visit(Reference &ope) { + result = ope.rule_ && ope.rule_->can_be_empty; } inline void DetectLeftRecursion::visit(Reference &ope) { - if (ope.name_ == name_) { - error_s = ope.s_; - } else if (!refs_.count(ope.name_)) { - refs_.insert(ope.name_); - if (ope.rule_) { - ope.rule_->accept(*this); - if (done_ == false) { return; } - } + if (ope.name_ == name_) { + error_s = ope.s_; + } else if (!ope.rule_ && !macro_args_stack_.empty()) { + // Macro parameter reference: resolve through nested macro arg + // stacks (e.g. B(X) <- C(X) where X is itself a param ref). + auto resolved = resolve_macro_arg(ope.iarg_); + if (resolved) { + resolved->accept(*this); + if (done_ == false) { return; } } - done_ = true; + } else if (!refs_.count(ope.name_)) { + refs_.insert(ope.name_); + if (ope.rule_) { + if (ope.is_macro_) { macro_args_stack_.push_back(&ope.args_); } + ope.rule_->accept(*this); + if (ope.is_macro_) { macro_args_stack_.pop_back(); } + if (done_ == false) { return; } + } + } + // If the referenced rule can match empty, don't mark as done — + // the sequence may continue past this element to find LR. + if (!ope.rule_ && !macro_args_stack_.empty()) { + auto resolved = resolve_macro_arg(ope.iarg_); + if (resolved) { + ComputeCanBeEmpty cbe; + resolved->accept(cbe); + done_ = !cbe.result; + } else { + done_ = true; + } + } else { + done_ = !(ope.rule_ && ope.rule_->can_be_empty); + } +} + +inline std::shared_ptr +DetectLeftRecursion::resolve_macro_arg(size_t iarg) const { + for (int i = static_cast(macro_args_stack_.size()) - 1; i >= 0; i--) { + auto &args = *macro_args_stack_[i]; + if (iarg >= args.size()) { return nullptr; } + auto ref = dynamic_cast(args[iarg].get()); + if (ref && !ref->rule_) { + // Another param ref — resolve using parent level's args + iarg = ref->iarg_; + continue; + } + return args[iarg]; + } + return nullptr; } inline void HasEmptyElement::visit(Sequence &ope) { - auto save_is_empty = false; - const char *save_error_s = nullptr; - std::string save_error_name; + auto save_is_empty = false; + const char *save_error_s = nullptr; + std::string save_error_name; - auto it = ope.opes_.begin(); - while (it != ope.opes_.end()) { - (*it)->accept(*this); - if (!is_empty) { - ++it; - while (it != ope.opes_.end()) { - DetectInfiniteLoop vis(refs_, has_error_cache_); - (*it)->accept(vis); - if (vis.has_error) { - is_empty = true; - error_s = vis.error_s; - error_name = vis.error_name; - } - ++it; - } - return; + auto it = ope.opes_.begin(); + while (it != ope.opes_.end()) { + (*it)->accept(*this); + if (!is_empty) { + ++it; + while (it != ope.opes_.end()) { + DetectInfiniteLoop vis(refs_, has_error_cache_); + (*it)->accept(vis); + if (vis.has_error) { + is_empty = true; + error_s = vis.error_s; + error_name = vis.error_name; } - - save_is_empty = is_empty; - save_error_s = error_s; - save_error_name = error_name; - - is_empty = false; - error_name.clear(); ++it; + } + return; } - is_empty = save_is_empty; - error_s = save_error_s; - error_name = save_error_name; + save_is_empty = is_empty; + save_error_s = error_s; + save_error_name = error_name; + + is_empty = false; + error_name.clear(); + ++it; + } + + is_empty = save_is_empty; + error_s = save_error_s; + error_name = save_error_name; } inline void HasEmptyElement::visit(Reference &ope) { - auto it = std::find_if(refs_.begin(), refs_.end(), - [&](const std::pair &ref) { - return ope.name_ == ref.second; - }); - if (it != refs_.end()) { return; } + auto it = std::find_if(refs_.begin(), refs_.end(), + [&](const std::pair &ref) { + return ope.name_ == ref.second; + }); + if (it != refs_.end()) { return; } - if (ope.rule_) { - refs_.emplace_back(ope.s_, ope.name_); - ope.rule_->accept(*this); - refs_.pop_back(); - } + if (ope.rule_) { + refs_.emplace_back(ope.s_, ope.name_); + ope.rule_->accept(*this); + refs_.pop_back(); + } } inline void DetectInfiniteLoop::visit(Reference &ope) { - auto it1 = std::find_if(refs_.begin(), refs_.end(), - [&](const std::pair &ref) { - return ope.name_ == ref.second; - }); - if (it1 != refs_.end()) { return; } + auto it = std::find_if(refs_.begin(), refs_.end(), + [&](const std::pair &ref) { + return ope.name_ == ref.second; + }); + if (it != refs_.end()) { return; } - if (ope.rule_) { - auto it = has_error_cache_.find(ope.name_); - if (it != has_error_cache_.end()) { - has_error = it->second; - } else { - refs_.emplace_back(ope.s_, ope.name_); - ope.rule_->accept(*this); - refs_.pop_back(); - has_error_cache_[ope.name_] = has_error; - } + if (ope.rule_) { + auto it = has_error_cache_.find(ope.name_); + if (it != has_error_cache_.end()) { + has_error = it->second; + } else { + refs_.emplace_back(ope.s_, ope.name_); + ope.rule_->accept(*this); + refs_.pop_back(); + has_error_cache_[ope.name_] = has_error; } + } - if (ope.is_macro_) { - for (auto arg : ope.args_) { - arg->accept(*this); - } + if (ope.is_macro_) { + for (const auto &arg : ope.args_) { + arg->accept(*this); } + } } inline void ReferenceChecker::visit(Reference &ope) { - auto it = std::find(params_.begin(), params_.end(), ope.name_); - if (it != params_.end()) { return; } + auto it = std::find(params_.begin(), params_.end(), ope.name_); + if (it != params_.end()) { return; } - if (!grammar_.count(ope.name_)) { + if (!grammar_.count(ope.name_)) { + error_s[ope.name_] = ope.s_; + error_message[ope.name_] = "'" + ope.name_ + "' is not defined."; + } else { + if (!referenced.count(ope.name_)) { referenced.insert(ope.name_); } + const auto &rule = grammar_.at(ope.name_); + if (rule.is_macro) { + if (!ope.is_macro_ || ope.args_.size() != rule.params.size()) { error_s[ope.name_] = ope.s_; - error_message[ope.name_] = "'" + ope.name_ + "' is not defined."; - } else { - if (!referenced.count(ope.name_)) { referenced.insert(ope.name_); } - const auto &rule = grammar_.at(ope.name_); - if (rule.is_macro) { - if (!ope.is_macro_ || ope.args_.size() != rule.params.size()) { - error_s[ope.name_] = ope.s_; - error_message[ope.name_] = "incorrect number of arguments."; - } - } else if (ope.is_macro_) { - error_s[ope.name_] = ope.s_; - error_message[ope.name_] = "'" + ope.name_ + "' is not macro."; - } - for (auto arg : ope.args_) { - arg->accept(*this); - } + error_message[ope.name_] = "incorrect number of arguments."; + } + } else if (ope.is_macro_) { + error_s[ope.name_] = ope.s_; + error_message[ope.name_] = "'" + ope.name_ + "' is not macro."; } + for (const auto &arg : ope.args_) { + arg->accept(*this); + } + } +} + +inline void ComputeFirstSet::visit(Reference &ope) { + if (!ope.rule_) { + // Macro parameter reference — can't predict what it will match + result_.any_char = true; + return; + } + if (refs_.count(ope.name_)) { return; } + refs_.insert(ope.name_); + ope.rule_->accept(*this); + if (!result_.first_rule && ope.rule_->is_token()) { + result_.first_rule = ope.rule_; + } + refs_.erase(ope.name_); +} + +inline void SetupFirstSets::visit(Reference &ope) { + if (!ope.rule_ || refs_.count(ope.name_)) { return; } + refs_.insert(ope.name_); + ope.rule_->accept(*this); + refs_.erase(ope.name_); +} + +inline void SetupFirstSets::visit(Sequence &ope) { + ope.kw_guard_.reset(); + setup_keyword_guarded_identifier(ope); + for (const auto &op : ope.opes_) { + op->accept(*this); + } +} + +inline void SetupFirstSets::setup_keyword_guarded_identifier(Sequence &seq) { + // Detect pattern: NotPredicate(Reference→PrioritizedChoice) + // TokenBoundary(Sequence[CharacterClass, + // Repetition(CharacterClass)]) + // This is the pattern used by: PlainIdentifier <- !ReservedKeyword + // <[a-z_]i[a-z0-9_]i*> + if (seq.opes_.size() != 2) { return; } + + // Child 0 must be NotPredicate + auto *not_pred = dynamic_cast(seq.opes_[0].get()); + if (!not_pred) { return; } + + // NotPredicate's child must be Reference to a rule + auto *ref = dynamic_cast(not_pred->ope_.get()); + if (!ref || !ref->rule_) { return; } + + // The referenced rule's inner operator (Holder) must contain + // PrioritizedChoice + auto *holder = dynamic_cast(ref->get_core_operator().get()); + if (!holder) { return; } + auto *choice = dynamic_cast(holder->ope_.get()); + if (!choice) { return; } + + // Extract keywords from PrioritizedChoice alternatives + std::vector exact_keywords; + std::vector prefix_keywords; + + for (const auto &alt : choice->opes_) { + auto *lit = dynamic_cast(alt.get()); + if (lit) { + if (!lit->ignore_case_) { return; } + exact_keywords.push_back(to_lower(lit->lit_)); + continue; + } + // Check for compound keyword (Sequence of LiteralStrings) + auto *sub_seq = dynamic_cast(alt.get()); + if (sub_seq && !sub_seq->opes_.empty()) { + auto *first_lit = dynamic_cast(sub_seq->opes_[0].get()); + if (first_lit) { + auto all_ignore_case_lits = + std::all_of(sub_seq->opes_.begin(), sub_seq->opes_.end(), + [](const auto &child) { + auto *l = dynamic_cast(child.get()); + return l && l->ignore_case_; + }); + if (all_ignore_case_lits) { + prefix_keywords.push_back(to_lower(first_lit->lit_)); + continue; + } + } + } + // Unrecognized alternative — bail out + return; + } + + if (exact_keywords.empty()) { return; } + + // Child 1 must be TokenBoundary + auto *tb = dynamic_cast(seq.opes_[1].get()); + if (!tb) { return; } + + // TokenBoundary content: Sequence[CharacterClass, Repetition(CharacterClass)] + // or just CharacterClass (single char identifier) + CharacterClass *first_cc = nullptr; + CharacterClass *rest_cc = nullptr; + + auto *inner_seq = dynamic_cast(tb->ope_.get()); + if (inner_seq && inner_seq->opes_.size() == 2) { + first_cc = dynamic_cast(inner_seq->opes_[0].get()); + auto *rep = dynamic_cast(inner_seq->opes_[1].get()); + if (rep) { rest_cc = dynamic_cast(rep->ope_.get()); } + } + + if (!first_cc || !rest_cc) { return; } + if (!first_cc->is_ascii_only() || !rest_cc->is_ascii_only()) { return; } + + // All conditions met — set up the fast path + auto kw = std::make_unique(); + kw->identifier_first = first_cc->ascii_bitset(); + kw->identifier_rest = rest_cc->ascii_bitset(); + + // Compute keyword length range for early-out in hot path + size_t min_len = SIZE_MAX, max_len = 0; + for (const auto &k : exact_keywords) { + min_len = std::min(min_len, k.size()); + max_len = std::max(max_len, k.size()); + } + for (const auto &k : prefix_keywords) { + min_len = std::min(min_len, k.size()); + max_len = std::max(max_len, k.size()); + } + kw->min_keyword_len = min_len; + kw->max_keyword_len = max_len; + + kw->exact_keywords = std::move(exact_keywords); + kw->prefix_keywords = std::move(prefix_keywords); + seq.kw_guard_ = std::move(kw); +} + +// Compute which rules benefit from packrat memoization. +// A rule benefits if it's reachable from 2+ alternatives of the same +// PrioritizedChoice (backtracking will re-visit it at the same position). +inline void Definition::initialize_packrat_filter() const { + std::call_once(packrat_filter_init_, [&]() { + auto def_count = definition_ids_.size(); + if (def_count == 0) { return; } + + // Collect rule IDs reachable from an Ope subtree (bitvector indexed by + // def_id) + struct CollectReachableRules : public TraversalVisitor { + using TraversalVisitor::visit; + std::vector reachable; // indexed by def_id + + CollectReachableRules(size_t n) : reachable(n, false) {} + + void visit(Holder &ope) override { + auto id = ope.outer_->id; + if (id < reachable.size()) { reachable[id] = true; } + ope.ope_->accept(*this); + } + void visit(Reference &ope) override { + if (ope.rule_ && ope.rule_->id < reachable.size() && + !reachable[ope.rule_->id]) { + reachable[ope.rule_->id] = true; + ope.rule_->accept(*this); + } + } + }; + + // Find rules that benefit: reachable from 2+ alternatives of same choice + std::vector benefits(def_count, false); + + struct FindBacktrackRules : public TraversalVisitor { + using TraversalVisitor::visit; + std::vector &benefits; + size_t def_count; + std::vector visited_rules; // indexed by def_id + + FindBacktrackRules(std::vector &b, size_t n) + : benefits(b), def_count(n), visited_rules(n, false) {} + + void visit(PrioritizedChoice &ope) override { + // For each alternative, collect reachable rules as bitvectors + std::vector> alt_reachable; + for (auto &op : ope.opes_) { + CollectReachableRules crr(def_count); + op->accept(crr); + alt_reachable.push_back(std::move(crr.reachable)); + } + + // Mark rules reachable from 2+ alternatives + for (size_t id = 0; id < def_count; id++) { + size_t count = 0; + for (auto &alt : alt_reachable) { + if (alt[id]) { count++; } + } + if (count >= 2) { benefits[id] = true; } + } + + // Recurse into alternatives + for (auto &op : ope.opes_) { + op->accept(*this); + } + } + void visit(Holder &ope) override { + auto id = ope.outer_->id; + if (id < visited_rules.size() && !visited_rules[id]) { + visited_rules[id] = true; + ope.ope_->accept(*this); + } + } + void visit(Reference &ope) override { + if (ope.rule_) { ope.rule_->accept(*this); } + } + }; + + FindBacktrackRules finder(benefits, def_count); + holder_->accept(finder); + if (whitespaceOpe) { whitespaceOpe->accept(finder); } + if (wordOpe) { wordOpe->accept(finder); } + + packrat_filter_ = std::move(benefits); + }); } inline void LinkReferences::visit(Reference &ope) { - // Check if the reference is a macro parameter - auto found_param = false; - for (size_t i = 0; i < params_.size(); i++) { - const auto ¶m = params_[i]; - if (param == ope.name_) { - ope.iarg_ = i; - found_param = true; - break; - } + // Check if the reference is a macro parameter + auto found_param = false; + for (size_t i = 0; i < params_.size(); i++) { + const auto ¶m = params_[i]; + if (param == ope.name_) { + ope.iarg_ = i; + found_param = true; + break; } + } - // Check if the reference is a definition rule - if (!found_param && grammar_.count(ope.name_)) { - auto &rule = grammar_.at(ope.name_); - ope.rule_ = &rule; - } + // Check if the reference is a definition rule + if (!found_param && grammar_.count(ope.name_)) { + auto &rule = grammar_.at(ope.name_); + ope.rule_ = &rule; + } - for (auto arg : ope.args_) { - arg->accept(*this); - } + for (const auto &arg : ope.args_) { + arg->accept(*this); + } } inline void FindReference::visit(Reference &ope) { - for (size_t i = 0; i < args_.size(); i++) { - const auto &name = params_[i]; - if (name == ope.name_) { - found_ope = args_[i]; - return; - } + for (size_t i = 0; i < args_.size(); i++) { + const auto &name = params_[i]; + if (name == ope.name_) { + found_ope = args_[i]; + return; } - found_ope = ope.shared_from_this(); + } + found_ope = ope.shared_from_this(); } /*----------------------------------------------------------------------------- @@ -3298,955 +4057,1011 @@ using Rules = std::unordered_map>; class ParserGenerator { public: - struct ParserContext { - std::shared_ptr grammar; - std::string start; - bool enablePackratParsing = false; - }; + struct ParserContext { + std::shared_ptr grammar; + std::string start; + bool enablePackratParsing = false; + }; - static ParserContext parse(const char *s, size_t n, const Rules &rules, - Log log, std::string_view start) { - return get_instance().perform_core(s, n, rules, log, std::string(start)); - } + static ParserContext parse(const char *s, size_t n, const Rules &rules, + Log log, std::string_view start, + bool enable_left_recursion = true) { + return get_instance().perform_core(s, n, rules, log, std::string(start), + enable_left_recursion); + } - // For debugging purpose - static bool parse_test(const char *d, const char *s) { - Data data; - std::any dt = &data; + // For debugging purpose + static bool parse_test(const char *d, const char *s) { + Data data; + std::any dt = &data; - auto n = strlen(s); - auto r = get_instance().g[d].parse(s, n, dt); - return r.ret && r.len == n; - } + auto n = strlen(s); + auto r = get_instance().g[d].parse(s, n, dt); + return r.ret && r.len == n; + } #if defined(__cpp_lib_char8_t) - static bool parse_test(const char *d, const char8_t *s) { - return parse_test(d, reinterpret_cast(s)); - } + static bool parse_test(const char *d, const char8_t *s) { + return parse_test(d, reinterpret_cast(s)); + } #endif private: - static ParserGenerator &get_instance() { - static ParserGenerator instance; - return instance; - } + static ParserGenerator &get_instance() { + static ParserGenerator instance; + return instance; + } - ParserGenerator() { - make_grammar(); - setup_actions(); - } + ParserGenerator() { + make_grammar(); + setup_actions(); + } - struct Instruction { - std::string type; - std::any data; - std::string_view sv; + struct Instruction { + std::string type; + std::any data; + std::string_view sv; + }; + + struct Data { + std::shared_ptr grammar; + std::string start; + const char *start_pos = nullptr; + + std::vector> duplicates_of_definition; + + std::vector> duplicates_of_instruction; + std::map> instructions; + + std::vector> undefined_back_references; + std::vector> captures_stack{{}}; + + std::set captures_in_current_definition; + bool enablePackratParsing = true; + + Data() : grammar(std::make_shared()) {} + }; + + class SyntaxErrorException : public std::runtime_error { + public: + SyntaxErrorException(const char *what_arg, std::pair r) + : std::runtime_error(what_arg), r_(r) {} + + std::pair line_info() const { return r_; } + + private: + std::pair r_; + }; + + void make_grammar() { + // Setup PEG syntax parser + g["Grammar"] <= seq(g["Spacing"], oom(g["Definition"]), g["EndOfFile"]); + g["Definition"] <= + cho(seq(g["Ignore"], g["IdentCont"], g["Parameters"], g["LEFTARROW"], + g["Expression"], opt(g["Instruction"])), + seq(g["Ignore"], g["Identifier"], g["LEFTARROW"], g["Expression"], + opt(g["Instruction"]))); + g["Expression"] <= seq(g["Sequence"], zom(seq(g["SLASH"], g["Sequence"]))); + g["Sequence"] <= zom(cho(g["CUT"], g["Prefix"])); + g["Prefix"] <= seq(opt(cho(g["AND"], g["NOT"])), g["SuffixWithLabel"]); + g["SuffixWithLabel"] <= + seq(g["Suffix"], opt(seq(g["LABEL"], g["Identifier"]))); + g["Suffix"] <= seq(g["Primary"], opt(g["Loop"])); + g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]); + g["Primary"] <= cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"], + npd(g["LEFTARROW"])), + seq(g["Ignore"], g["Identifier"], + npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))), + seq(g["OPEN"], g["Expression"], g["CLOSE"]), + seq(g["BeginTok"], g["Expression"], g["EndTok"]), + g["CapScope"], + seq(g["BeginCap"], g["Expression"], g["EndCap"]), + g["BackRef"], g["DictionaryI"], g["LiteralI"], + g["Dictionary"], g["Literal"], g["NegatedClassI"], + g["NegatedClass"], g["ClassI"], g["Class"], g["DOT"]); + + g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); + g["IdentCont"] <= tok(seq(g["IdentStart"], zom(g["IdentRest"]))); + + const static std::vector> range = { + {0x0080, 0xFFFF}}; + g["IdentStart"] <= seq(npd(lit(u8(u8"↑"))), npd(lit(u8(u8"⇑"))), + cho(cls("a-zA-Z_%"), cls(range))); + + g["IdentRest"] <= cho(g["IdentStart"], cls("0-9")); + + g["Dictionary"] <= seq(g["LiteralD"], oom(seq(g["PIPE"], g["LiteralD"]))); + + g["DictionaryI"] <= + seq(g["LiteralID"], oom(seq(g["PIPE"], g["LiteralID"]))); + + auto lit_ope = cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), + cls("'"), g["Spacing"]), + seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), + cls("\""), g["Spacing"])); + g["Literal"] <= lit_ope; + g["LiteralD"] <= lit_ope; + + auto lit_case_ignore_ope = + cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), lit("'i"), + g["Spacing"]), + seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), lit("\"i"), + g["Spacing"])); + g["LiteralI"] <= lit_case_ignore_ope; + g["LiteralID"] <= lit_case_ignore_ope; + + // NOTE: The original Brian Ford's paper uses 'zom' instead of 'oom'. + g["Class"] <= seq(chr('['), npd(chr('^')), + tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), + g["Spacing"]); + g["ClassI"] <= seq(chr('['), npd(chr('^')), + tok(oom(seq(npd(chr(']')), g["Range"]))), lit("]i"), + g["Spacing"]); + + g["NegatedClass"] <= seq(lit("[^"), + tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), + g["Spacing"]); + g["NegatedClassI"] <= seq(lit("[^"), + tok(oom(seq(npd(chr(']')), g["Range"]))), + lit("]i"), g["Spacing"]); + + // NOTE: This is different from The original Brian Ford's paper, and this + // modification allows us to specify `[+-]` as a valid char class. + g["Range"] <= + cho(seq(g["Char"], chr('-'), npd(chr(']')), g["Char"]), g["Char"]); + + g["Char"] <= + cho(seq(chr('\\'), cls("fnrtv'\"[]\\^-")), + seq(chr('\\'), cls("0-3"), cls("0-7"), cls("0-7")), + seq(chr('\\'), cls("0-7"), opt(cls("0-7"))), + seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))), + seq(lit("\\u"), + cho(seq(cho(seq(chr('0'), cls("0-9a-fA-F")), lit("10")), + rep(cls("0-9a-fA-F"), 4, 4)), + rep(cls("0-9a-fA-F"), 4, 5))), + seq(npd(chr('\\')), dot())); + + g["Repetition"] <= + seq(g["BeginBracket"], g["RepetitionRange"], g["EndBracket"]); + g["RepetitionRange"] <= cho(seq(g["Number"], g["COMMA"], g["Number"]), + seq(g["Number"], g["COMMA"]), g["Number"], + seq(g["COMMA"], g["Number"])); + g["Number"] <= seq(oom(cls("0-9")), g["Spacing"]); + + g["CapScope"] <= seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]); + + g["LEFTARROW"] <= seq(cho(lit("<-"), lit(u8(u8"←"))), g["Spacing"]); + ~g["SLASH"] <= seq(chr('/'), g["Spacing"]); + ~g["PIPE"] <= seq(chr('|'), g["Spacing"]); + g["AND"] <= seq(chr('&'), g["Spacing"]); + g["NOT"] <= seq(chr('!'), g["Spacing"]); + g["QUESTION"] <= seq(chr('?'), g["Spacing"]); + g["STAR"] <= seq(chr('*'), g["Spacing"]); + g["PLUS"] <= seq(chr('+'), g["Spacing"]); + ~g["OPEN"] <= seq(chr('('), g["Spacing"]); + ~g["CLOSE"] <= seq(chr(')'), g["Spacing"]); + g["DOT"] <= seq(chr('.'), g["Spacing"]); + + g["CUT"] <= seq(lit(u8(u8"↑")), g["Spacing"]); + ~g["LABEL"] <= seq(cho(chr('^'), lit(u8(u8"⇑"))), g["Spacing"]); + + ~g["Spacing"] <= zom(cho(g["Space"], g["Comment"])); + g["Comment"] <= seq(chr('#'), zom(seq(npd(g["EndOfLine"]), dot())), + opt(g["EndOfLine"])); + g["Space"] <= cho(chr(' '), chr('\t'), g["EndOfLine"]); + g["EndOfLine"] <= cho(lit("\r\n"), chr('\n'), chr('\r')); + g["EndOfFile"] <= npd(dot()); + + ~g["BeginTok"] <= seq(chr('<'), g["Spacing"]); + ~g["EndTok"] <= seq(chr('>'), g["Spacing"]); + + ~g["BeginCapScope"] <= seq(chr('$'), chr('('), g["Spacing"]); + ~g["EndCapScope"] <= seq(chr(')'), g["Spacing"]); + + g["BeginCap"] <= seq(chr('$'), tok(g["IdentCont"]), chr('<'), g["Spacing"]); + ~g["EndCap"] <= seq(chr('>'), g["Spacing"]); + + g["BackRef"] <= seq(chr('$'), tok(g["IdentCont"]), g["Spacing"]); + + g["IGNORE"] <= chr('~'); + + g["Ignore"] <= opt(g["IGNORE"]); + g["Parameters"] <= seq(g["OPEN"], g["Identifier"], + zom(seq(g["COMMA"], g["Identifier"])), g["CLOSE"]); + g["Arguments"] <= seq(g["OPEN"], g["Expression"], + zom(seq(g["COMMA"], g["Expression"])), g["CLOSE"]); + ~g["COMMA"] <= seq(chr(','), g["Spacing"]); + + // Instruction grammars + g["Instruction"] <= + seq(g["BeginBracket"], + opt(seq(g["InstructionItem"], zom(seq(g["InstructionItemSeparator"], + g["InstructionItem"])))), + g["EndBracket"]); + g["InstructionItem"] <= + cho(g["PrecedenceClimbing"], g["ErrorMessage"], g["NoAstOpt"]); + ~g["InstructionItemSeparator"] <= seq(chr(';'), g["Spacing"]); + + ~g["SpacesZom"] <= zom(g["Space"]); + ~g["SpacesOom"] <= oom(g["Space"]); + ~g["BeginBracket"] <= seq(chr('{'), g["Spacing"]); + ~g["EndBracket"] <= seq(chr('}'), g["Spacing"]); + + // PrecedenceClimbing instruction + g["PrecedenceClimbing"] <= + seq(lit("precedence"), g["SpacesOom"], g["PrecedenceInfo"], + zom(seq(g["SpacesOom"], g["PrecedenceInfo"])), g["SpacesZom"]); + g["PrecedenceInfo"] <= + seq(g["PrecedenceAssoc"], + oom(seq(ign(g["SpacesOom"]), g["PrecedenceOpe"]))); + g["PrecedenceOpe"] <= + cho(seq(cls("'"), + tok(zom(seq(npd(cho(g["Space"], cls("'"))), g["Char"]))), + cls("'")), + seq(cls("\""), + tok(zom(seq(npd(cho(g["Space"], cls("\""))), g["Char"]))), + cls("\"")), + tok(oom(seq(npd(cho(g["PrecedenceAssoc"], g["Space"], chr('}'))), + dot())))); + g["PrecedenceAssoc"] <= cls("LR"); + + // Error message instruction + g["ErrorMessage"] <= seq(lit("error_message"), g["SpacesOom"], + g["LiteralD"], g["SpacesZom"]); + + // No Ast node optimization instruction + g["NoAstOpt"] <= seq(lit("no_ast_opt"), g["SpacesZom"]); + + // Set definition names + for (auto &x : g) { + x.second.name = x.first; + } + } + + void setup_actions() { + g["Definition"] = [&](const SemanticValues &vs, std::any &dt) { + auto &data = *std::any_cast(dt); + + auto is_macro = vs.choice() == 0; + auto ignore = std::any_cast(vs[0]); + auto name = std::any_cast(vs[1]); + + std::vector params; + std::shared_ptr ope; + auto has_instructions = false; + + if (is_macro) { + params = std::any_cast>(vs[2]); + ope = std::any_cast>(vs[4]); + if (vs.size() == 6) { has_instructions = true; } + } else { + ope = std::any_cast>(vs[3]); + if (vs.size() == 5) { has_instructions = true; } + } + + if (has_instructions) { + auto index = is_macro ? 5 : 4; + std::unordered_set types; + for (const auto &instruction : + std::any_cast>(vs[index])) { + const auto &type = instruction.type; + if (types.find(type) == types.end()) { + data.instructions[name].push_back(instruction); + types.insert(instruction.type); + if (type == "declare_symbol" || type == "check_symbol") { + if (!TokenChecker::is_token(*ope)) { ope = tok(ope); } + } + } else { + data.duplicates_of_instruction.emplace_back(type, + instruction.sv.data()); + } + } + } + + auto &grammar = *data.grammar; + if (!grammar.count(name)) { + auto &rule = grammar[name]; + rule <= ope; + rule.name = name; + rule.s_ = vs.sv().data(); + rule.line_ = line_info(vs.ss, rule.s_); + rule.ignoreSemanticValue = ignore; + rule.is_macro = is_macro; + rule.params = params; + + if (data.start.empty()) { + data.start = rule.name; + data.start_pos = rule.s_; + } + } else { + data.duplicates_of_definition.emplace_back(name, vs.sv().data()); + } }; - struct Data { - std::shared_ptr grammar; - std::string start; - const char *start_pos = nullptr; - - std::vector> duplicates_of_definition; - - std::vector> duplicates_of_instruction; - std::map> instructions; - - std::vector> undefined_back_references; - std::vector> captures_stack{{}}; - - std::set captures_in_current_definition; - bool enablePackratParsing = true; - - Data() : grammar(std::make_shared()) {} + g["Definition"].enter = [](const Context & /*c*/, const char * /*s*/, + size_t /*n*/, std::any &dt) { + auto &data = *std::any_cast(dt); + data.captures_in_current_definition.clear(); }; - void make_grammar() { - // Setup PEG syntax parser - g["Grammar"] <= seq(g["Spacing"], oom(g["Definition"]), g["EndOfFile"]); - g["Definition"] <= - cho(seq(g["Ignore"], g["IdentCont"], g["Parameters"], g["LEFTARROW"], - g["Expression"], opt(g["Instruction"])), - seq(g["Ignore"], g["Identifier"], g["LEFTARROW"], g["Expression"], - opt(g["Instruction"]))); - g["Expression"] <= seq(g["Sequence"], zom(seq(g["SLASH"], g["Sequence"]))); - g["Sequence"] <= zom(cho(g["CUT"], g["Prefix"])); - g["Prefix"] <= seq(opt(cho(g["AND"], g["NOT"])), g["SuffixWithLabel"]); - g["SuffixWithLabel"] <= - seq(g["Suffix"], opt(seq(g["LABEL"], g["Identifier"]))); - g["Suffix"] <= seq(g["Primary"], opt(g["Loop"])); - g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]); - g["Primary"] <= cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"], - npd(g["LEFTARROW"])), - seq(g["Ignore"], g["Identifier"], - npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))), - seq(g["OPEN"], g["Expression"], g["CLOSE"]), - seq(g["BeginTok"], g["Expression"], g["EndTok"]), - g["CapScope"], - seq(g["BeginCap"], g["Expression"], g["EndCap"]), - g["BackRef"], g["DictionaryI"], g["LiteralI"], - g["Dictionary"], g["Literal"], g["NegatedClassI"], - g["NegatedClass"], g["ClassI"], g["Class"], g["DOT"]); - - g["Identifier"] <= seq(g["IdentCont"], g["Spacing"]); - g["IdentCont"] <= tok(seq(g["IdentStart"], zom(g["IdentRest"]))); - - const static std::vector> range = { - {0x0080, 0xFFFF}}; - g["IdentStart"] <= seq(npd(lit(u8(u8"↑"))), npd(lit(u8(u8"⇑"))), - cho(cls("a-zA-Z_%"), cls(range))); - - g["IdentRest"] <= cho(g["IdentStart"], cls("0-9")); - - g["Dictionary"] <= seq(g["LiteralD"], oom(seq(g["PIPE"], g["LiteralD"]))); - - g["DictionaryI"] <= - seq(g["LiteralID"], oom(seq(g["PIPE"], g["LiteralID"]))); - - auto lit_ope = cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), - cls("'"), g["Spacing"]), - seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), - cls("\""), g["Spacing"])); - g["Literal"] <= lit_ope; - g["LiteralD"] <= lit_ope; - - auto lit_case_ignore_ope = - cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), lit("'i"), - g["Spacing"]), - seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), lit("\"i"), - g["Spacing"])); - g["LiteralI"] <= lit_case_ignore_ope; - g["LiteralID"] <= lit_case_ignore_ope; - - // NOTE: The original Brian Ford's paper uses 'zom' instead of 'oom'. - g["Class"] <= seq(chr('['), npd(chr('^')), - tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), - g["Spacing"]); - g["ClassI"] <= seq(chr('['), npd(chr('^')), - tok(oom(seq(npd(chr(']')), g["Range"]))), lit("]i"), - g["Spacing"]); - - g["NegatedClass"] <= seq(lit("[^"), - tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'), - g["Spacing"]); - g["NegatedClassI"] <= seq(lit("[^"), - tok(oom(seq(npd(chr(']')), g["Range"]))), - lit("]i"), g["Spacing"]); - - // NOTE: This is different from The original Brian Ford's paper, and this - // modification allows us to specify `[+-]` as a valid char class. - g["Range"] <= - cho(seq(g["Char"], chr('-'), npd(chr(']')), g["Char"]), g["Char"]); - - g["Char"] <= - cho(seq(chr('\\'), cls("fnrtv'\"[]\\^")), - seq(chr('\\'), cls("0-3"), cls("0-7"), cls("0-7")), - seq(chr('\\'), cls("0-7"), opt(cls("0-7"))), - seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))), - seq(lit("\\u"), - cho(seq(cho(seq(chr('0'), cls("0-9a-fA-F")), lit("10")), - rep(cls("0-9a-fA-F"), 4, 4)), - rep(cls("0-9a-fA-F"), 4, 5))), - seq(npd(chr('\\')), dot())); - - g["Repetition"] <= - seq(g["BeginBracket"], g["RepetitionRange"], g["EndBracket"]); - g["RepetitionRange"] <= cho(seq(g["Number"], g["COMMA"], g["Number"]), - seq(g["Number"], g["COMMA"]), g["Number"], - seq(g["COMMA"], g["Number"])); - g["Number"] <= seq(oom(cls("0-9")), g["Spacing"]); - - g["CapScope"] <= seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]); - - g["LEFTARROW"] <= seq(cho(lit("<-"), lit(u8(u8"←"))), g["Spacing"]); - ~g["SLASH"] <= seq(chr('/'), g["Spacing"]); - ~g["PIPE"] <= seq(chr('|'), g["Spacing"]); - g["AND"] <= seq(chr('&'), g["Spacing"]); - g["NOT"] <= seq(chr('!'), g["Spacing"]); - g["QUESTION"] <= seq(chr('?'), g["Spacing"]); - g["STAR"] <= seq(chr('*'), g["Spacing"]); - g["PLUS"] <= seq(chr('+'), g["Spacing"]); - ~g["OPEN"] <= seq(chr('('), g["Spacing"]); - ~g["CLOSE"] <= seq(chr(')'), g["Spacing"]); - g["DOT"] <= seq(chr('.'), g["Spacing"]); - - g["CUT"] <= seq(lit(u8(u8"↑")), g["Spacing"]); - ~g["LABEL"] <= seq(cho(chr('^'), lit(u8(u8"⇑"))), g["Spacing"]); - - ~g["Spacing"] <= zom(cho(g["Space"], g["Comment"])); - g["Comment"] <= - seq(chr('#'), zom(seq(npd(g["EndOfLine"]), dot())), g["EndOfLine"]); - g["Space"] <= cho(chr(' '), chr('\t'), g["EndOfLine"]); - g["EndOfLine"] <= cho(lit("\r\n"), chr('\n'), chr('\r')); - g["EndOfFile"] <= npd(dot()); - - ~g["BeginTok"] <= seq(chr('<'), g["Spacing"]); - ~g["EndTok"] <= seq(chr('>'), g["Spacing"]); - - ~g["BeginCapScope"] <= seq(chr('$'), chr('('), g["Spacing"]); - ~g["EndCapScope"] <= seq(chr(')'), g["Spacing"]); - - g["BeginCap"] <= seq(chr('$'), tok(g["IdentCont"]), chr('<'), g["Spacing"]); - ~g["EndCap"] <= seq(chr('>'), g["Spacing"]); - - g["BackRef"] <= seq(chr('$'), tok(g["IdentCont"]), g["Spacing"]); - - g["IGNORE"] <= chr('~'); - - g["Ignore"] <= opt(g["IGNORE"]); - g["Parameters"] <= seq(g["OPEN"], g["Identifier"], - zom(seq(g["COMMA"], g["Identifier"])), g["CLOSE"]); - g["Arguments"] <= seq(g["OPEN"], g["Expression"], - zom(seq(g["COMMA"], g["Expression"])), g["CLOSE"]); - ~g["COMMA"] <= seq(chr(','), g["Spacing"]); - - // Instruction grammars - g["Instruction"] <= - seq(g["BeginBracket"], - opt(seq(g["InstructionItem"], zom(seq(g["InstructionItemSeparator"], - g["InstructionItem"])))), - g["EndBracket"]); - g["InstructionItem"] <= - cho(g["PrecedenceClimbing"], g["ErrorMessage"], g["NoAstOpt"]); - ~g["InstructionItemSeparator"] <= seq(chr(';'), g["Spacing"]); - - ~g["SpacesZom"] <= zom(g["Space"]); - ~g["SpacesOom"] <= oom(g["Space"]); - ~g["BeginBracket"] <= seq(chr('{'), g["Spacing"]); - ~g["EndBracket"] <= seq(chr('}'), g["Spacing"]); - - // PrecedenceClimbing instruction - g["PrecedenceClimbing"] <= - seq(lit("precedence"), g["SpacesOom"], g["PrecedenceInfo"], - zom(seq(g["SpacesOom"], g["PrecedenceInfo"])), g["SpacesZom"]); - g["PrecedenceInfo"] <= - seq(g["PrecedenceAssoc"], - oom(seq(ign(g["SpacesOom"]), g["PrecedenceOpe"]))); - g["PrecedenceOpe"] <= - cho(seq(cls("'"), - tok(zom(seq(npd(cho(g["Space"], cls("'"))), g["Char"]))), - cls("'")), - seq(cls("\""), - tok(zom(seq(npd(cho(g["Space"], cls("\""))), g["Char"]))), - cls("\"")), - tok(oom(seq(npd(cho(g["PrecedenceAssoc"], g["Space"], chr('}'))), - dot())))); - g["PrecedenceAssoc"] <= cls("LR"); - - // Error message instruction - g["ErrorMessage"] <= seq(lit("error_message"), g["SpacesOom"], - g["LiteralD"], g["SpacesZom"]); - - // No Ast node optimization instruction - g["NoAstOpt"] <= seq(lit("no_ast_opt"), g["SpacesZom"]); - - // Set definition names - for (auto &x : g) { - x.second.name = x.first; + g["Expression"] = [&](const SemanticValues &vs) { + if (vs.size() == 1) { + return std::any_cast>(vs[0]); + } else { + std::vector> opes; + for (auto i = 0u; i < vs.size(); i++) { + opes.emplace_back(std::any_cast>(vs[i])); } - } + const std::shared_ptr ope = + std::make_shared(opes); + return ope; + } + }; - void setup_actions() { - g["Definition"] = [&](const SemanticValues &vs, std::any &dt) { - auto &data = *std::any_cast(dt); - - auto is_macro = vs.choice() == 0; - auto ignore = std::any_cast(vs[0]); - auto name = std::any_cast(vs[1]); - - std::vector params; - std::shared_ptr ope; - auto has_instructions = false; - - if (is_macro) { - params = std::any_cast>(vs[2]); - ope = std::any_cast>(vs[4]); - if (vs.size() == 6) { has_instructions = true; } - } else { - ope = std::any_cast>(vs[3]); - if (vs.size() == 5) { has_instructions = true; } - } - - if (has_instructions) { - auto index = is_macro ? 5 : 4; - std::unordered_set types; - for (const auto &instruction : - std::any_cast>(vs[index])) { - const auto &type = instruction.type; - if (types.find(type) == types.end()) { - data.instructions[name].push_back(instruction); - types.insert(instruction.type); - if (type == "declare_symbol" || type == "check_symbol") { - if (!TokenChecker::is_token(*ope)) { ope = tok(ope); } - } - } else { - data.duplicates_of_instruction.emplace_back(type, - instruction.sv.data()); - } - } - } - - auto &grammar = *data.grammar; - if (!grammar.count(name)) { - auto &rule = grammar[name]; - rule <= ope; - rule.name = name; - rule.s_ = vs.sv().data(); - rule.line_ = line_info(vs.ss, rule.s_); - rule.ignoreSemanticValue = ignore; - rule.is_macro = is_macro; - rule.params = params; - - if (data.start.empty()) { - data.start = rule.name; - data.start_pos = rule.s_; - } - } else { - data.duplicates_of_definition.emplace_back(name, vs.sv().data()); - } - }; - - g["Definition"].enter = [](const Context & /*c*/, const char * /*s*/, - size_t /*n*/, std::any &dt) { - auto &data = *std::any_cast(dt); - data.captures_in_current_definition.clear(); - }; - - g["Expression"] = [&](const SemanticValues &vs) { - if (vs.size() == 1) { - return std::any_cast>(vs[0]); - } else { - std::vector> opes; - for (auto i = 0u; i < vs.size(); i++) { - opes.emplace_back(std::any_cast>(vs[i])); - } - const std::shared_ptr ope = - std::make_shared(opes); - return ope; - } - }; - - g["Sequence"] = [&](const SemanticValues &vs) { - if (vs.empty()) { - return npd(lit("")); - } else if (vs.size() == 1) { - return std::any_cast>(vs[0]); - } else { - std::vector> opes; - for (const auto &x : vs) { - opes.emplace_back(std::any_cast>(x)); - } - const std::shared_ptr ope = std::make_shared(opes); - return ope; - } - }; - - g["Prefix"] = [&](const SemanticValues &vs) { - std::shared_ptr ope; - if (vs.size() == 1) { - ope = std::any_cast>(vs[0]); - } else { - assert(vs.size() == 2); - auto tok = std::any_cast(vs[0]); - ope = std::any_cast>(vs[1]); - if (tok == '&') { - ope = apd(ope); - } else { // '!' - ope = npd(ope); - } - } - return ope; - }; - - g["SuffixWithLabel"] = [&](const SemanticValues &vs, std::any &dt) { - auto ope = std::any_cast>(vs[0]); - if (vs.size() == 1) { - return ope; - } else { - assert(vs.size() == 2); - auto &data = *std::any_cast(dt); - const auto &ident = std::any_cast(vs[1]); - auto label = ref(*data.grammar, ident, vs.sv().data(), false, {}); - auto recovery = rec(ref(*data.grammar, RECOVER_DEFINITION_NAME, - vs.sv().data(), true, {label})); - return cho4label_(ope, recovery); - } - }; - - struct Loop { - enum class Type { opt = 0, zom, oom, rep }; - Type type; - std::pair range; - }; - - g["Suffix"] = [&](const SemanticValues &vs) { - auto ope = std::any_cast>(vs[0]); - if (vs.size() == 1) { - return ope; - } else { - assert(vs.size() == 2); - auto loop = std::any_cast(vs[1]); - switch (loop.type) { - case Loop::Type::opt: return opt(ope); - case Loop::Type::zom: return zom(ope); - case Loop::Type::oom: return oom(ope); - default: // Regex-like repetition - return rep(ope, loop.range.first, loop.range.second); - } - } - }; - - g["Loop"] = [&](const SemanticValues &vs) { - switch (vs.choice()) { - case 0: // Option - return Loop{Loop::Type::opt, std::pair()}; - case 1: // Zero or More - return Loop{Loop::Type::zom, std::pair()}; - case 2: // One or More - return Loop{Loop::Type::oom, std::pair()}; - default: // Regex-like repetition - return Loop{Loop::Type::rep, - std::any_cast>(vs[0])}; - } - }; - - g["Primary"] = [&](const SemanticValues &vs, std::any &dt) { - auto &data = *std::any_cast(dt); - - switch (vs.choice()) { - case 0: // Macro Reference - case 1: { // Reference - auto is_macro = vs.choice() == 0; - auto ignore = std::any_cast(vs[0]); - const auto &ident = std::any_cast(vs[1]); - - std::vector> args; - if (is_macro) { - args = std::any_cast>>(vs[2]); - } - - auto ope = ref(*data.grammar, ident, vs.sv().data(), is_macro, args); - if (ident == RECOVER_DEFINITION_NAME) { ope = rec(ope); } - - if (ignore) { - return ign(ope); - } else { - return ope; - } - } - case 2: { // (Expression) - return std::any_cast>(vs[0]); - } - case 3: { // TokenBoundary - return tok(std::any_cast>(vs[0])); - } - case 4: { // CaptureScope - return csc(std::any_cast>(vs[0])); - } - case 5: { // Capture - const auto &name = std::any_cast(vs[0]); - auto ope = std::any_cast>(vs[1]); - - data.captures_stack.back().insert(name); - data.captures_in_current_definition.insert(name); - - return cap(ope, [name](const char *a_s, size_t a_n, Context &c) { - auto &cs = c.capture_scope_stack[c.capture_scope_stack_size - 1]; - cs[name] = std::string(a_s, a_n); - }); - } - default: { - return std::any_cast>(vs[0]); - } - } - }; - - g["IdentCont"] = [](const SemanticValues &vs) { - return std::string(vs.sv().data(), vs.sv().length()); - }; - - g["Dictionary"] = [](const SemanticValues &vs) { - auto items = vs.transform(); - return dic(items, false); - }; - g["DictionaryI"] = [](const SemanticValues &vs) { - auto items = vs.transform(); - return dic(items, true); - }; - - g["Literal"] = [](const SemanticValues &vs) { - const auto &tok = vs.tokens.front(); - return lit(resolve_escape_sequence(tok.data(), tok.size())); - }; - g["LiteralI"] = [](const SemanticValues &vs) { - const auto &tok = vs.tokens.front(); - return liti(resolve_escape_sequence(tok.data(), tok.size())); - }; - g["LiteralD"] = [](const SemanticValues &vs) { - auto &tok = vs.tokens.front(); - return resolve_escape_sequence(tok.data(), tok.size()); - }; - g["LiteralID"] = [](const SemanticValues &vs) { - auto &tok = vs.tokens.front(); - return resolve_escape_sequence(tok.data(), tok.size()); - }; - - g["Class"] = [](const SemanticValues &vs) { - auto ranges = vs.transform>(); - return cls(ranges); - }; - g["ClassI"] = [](const SemanticValues &vs) { - auto ranges = vs.transform>(); - return cls(ranges, true); - }; - g["NegatedClass"] = [](const SemanticValues &vs) { - auto ranges = vs.transform>(); - return ncls(ranges); - }; - g["NegatedClassI"] = [](const SemanticValues &vs) { - auto ranges = vs.transform>(); - return ncls(ranges, true); - }; - g["Range"] = [](const SemanticValues &vs) { - switch (vs.choice()) { - case 0: { - auto s1 = std::any_cast(vs[0]); - auto s2 = std::any_cast(vs[1]); - auto cp1 = decode_codepoint(s1.data(), s1.length()); - auto cp2 = decode_codepoint(s2.data(), s2.length()); - return std::pair(cp1, cp2); - } - case 1: { - auto s = std::any_cast(vs[0]); - auto cp = decode_codepoint(s.data(), s.length()); - return std::pair(cp, cp); - } - } - return std::pair(0, 0); - }; - g["Char"] = [](const SemanticValues &vs) { - return resolve_escape_sequence(vs.sv().data(), vs.sv().length()); - }; - - g["RepetitionRange"] = [&](const SemanticValues &vs) { - switch (vs.choice()) { - case 0: { // Number COMMA Number - auto min = std::any_cast(vs[0]); - auto max = std::any_cast(vs[1]); - return std::pair(min, max); - } - case 1: // Number COMMA - return std::pair(std::any_cast(vs[0]), - std::numeric_limits::max()); - case 2: { // Number - auto n = std::any_cast(vs[0]); - return std::pair(n, n); - } - default: // COMMA Number - return std::pair(std::numeric_limits::min(), - std::any_cast(vs[0])); - } - }; - g["Number"] = [&](const SemanticValues &vs) { - return vs.token_to_number(); - }; - - g["CapScope"].enter = [](const Context & /*c*/, const char * /*s*/, - size_t /*n*/, std::any &dt) { - auto &data = *std::any_cast(dt); - data.captures_stack.emplace_back(); - }; - g["CapScope"].leave = [](const Context & /*c*/, const char * /*s*/, - size_t /*n*/, size_t /*matchlen*/, - std::any & /*value*/, std::any &dt) { - auto &data = *std::any_cast(dt); - data.captures_stack.pop_back(); - }; - - g["AND"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; - g["NOT"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; - g["QUESTION"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; - g["STAR"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; - g["PLUS"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; - - g["DOT"] = [](const SemanticValues & /*vs*/) { return dot(); }; - - g["CUT"] = [](const SemanticValues & /*vs*/) { return cut(); }; - - g["BeginCap"] = [](const SemanticValues &vs) { return vs.token(); }; - - g["BackRef"] = [&](const SemanticValues &vs, std::any &dt) { - auto &data = *std::any_cast(dt); - - // Undefined back reference check - { - auto found = false; - auto it = data.captures_stack.rbegin(); - while (it != data.captures_stack.rend()) { - if (it->find(vs.token()) != it->end()) { - found = true; - break; - } - ++it; - } - if (!found) { - auto ptr = vs.token().data() - 1; // include '$' symbol - data.undefined_back_references.emplace_back(vs.token(), ptr); - } - } - - // NOTE: Disable packrat parsing if a back reference is not defined in - // captures in the current definition rule. - if (data.captures_in_current_definition.find(vs.token()) == - data.captures_in_current_definition.end()) { - data.enablePackratParsing = false; - } - - return bkr(vs.token_to_string()); - }; - - g["Ignore"] = [](const SemanticValues &vs) { return vs.size() > 0; }; - - g["Parameters"] = [](const SemanticValues &vs) { - return vs.transform(); - }; - - g["Arguments"] = [](const SemanticValues &vs) { - return vs.transform>(); - }; - - g["PrecedenceClimbing"] = [](const SemanticValues &vs) { - PrecedenceClimbing::BinOpeInfo binOpeInfo; - size_t level = 1; - for (auto v : vs) { - auto tokens = std::any_cast>(v); - auto assoc = tokens[0][0]; - for (size_t i = 1; i < tokens.size(); i++) { - binOpeInfo[tokens[i]] = std::pair(level, assoc); - } - level++; - } - Instruction instruction; - instruction.type = "precedence"; - instruction.data = binOpeInfo; - instruction.sv = vs.sv(); - return instruction; - }; - g["PrecedenceInfo"] = [](const SemanticValues &vs) { - return vs.transform(); - }; - g["PrecedenceOpe"] = [](const SemanticValues &vs) { return vs.token(); }; - g["PrecedenceAssoc"] = [](const SemanticValues &vs) { return vs.token(); }; - - g["ErrorMessage"] = [](const SemanticValues &vs) { - Instruction instruction; - instruction.type = "error_message"; - instruction.data = std::any_cast(vs[0]); - instruction.sv = vs.sv(); - return instruction; - }; - - g["NoAstOpt"] = [](const SemanticValues &vs) { - Instruction instruction; - instruction.type = "no_ast_opt"; - instruction.sv = vs.sv(); - return instruction; - }; - - g["Instruction"] = [](const SemanticValues &vs) { - return vs.transform(); - }; - } - - bool apply_precedence_instruction(Definition &rule, - const PrecedenceClimbing::BinOpeInfo &info, - const char *s, Log log) { - try { - auto &seq = dynamic_cast(*rule.get_core_operator()); - auto atom = seq.opes_[0]; - auto &rep = dynamic_cast(*seq.opes_[1]); - auto &seq1 = dynamic_cast(*rep.ope_); - auto binop = seq1.opes_[0]; - auto atom1 = seq1.opes_[1]; - - auto atom_name = dynamic_cast(*atom).name_; - auto binop_name = dynamic_cast(*binop).name_; - auto atom1_name = dynamic_cast(*atom1).name_; - - if (!rep.is_zom() || atom_name != atom1_name || atom_name == binop_name) { - if (log) { - auto line = line_info(s, rule.s_); - log(line.first, line.second, - "'precedence' instruction cannot be applied to '" + rule.name + - "'.", - ""); - } - return false; - } - - rule.holder_->ope_ = pre(atom, binop, info, rule); - rule.disable_action = true; - } catch (...) { - if (log) { - auto line = line_info(s, rule.s_); - log(line.first, line.second, - "'precedence' instruction cannot be applied to '" + rule.name + - "'.", - ""); - } - return false; + g["Sequence"] = [&](const SemanticValues &vs) { + if (vs.empty()) { + return npd(lit("")); + } else if (vs.size() == 1) { + return std::any_cast>(vs[0]); + } else { + std::vector> opes; + for (const auto &x : vs) { + opes.emplace_back(std::any_cast>(x)); } - return true; - } + const std::shared_ptr ope = std::make_shared(opes); + return ope; + } + }; - ParserContext perform_core(const char *s, size_t n, const Rules &rules, - Log log, std::string requested_start) { - Data data; - auto &grammar = *data.grammar; + g["Prefix"] = [&](const SemanticValues &vs) { + std::shared_ptr ope; + if (vs.size() == 1) { + ope = std::any_cast>(vs[0]); + } else { + assert(vs.size() == 2); + auto tok = std::any_cast(vs[0]); + ope = std::any_cast>(vs[1]); + if (tok == '&') { + ope = apd(ope); + } else { // '!' + ope = npd(ope); + } + } + return ope; + }; - // Built-in macros - { - // `%recover` - { - auto &rule = grammar[RECOVER_DEFINITION_NAME]; - rule <= ref(grammar, "x", "", false, {}); - rule.name = RECOVER_DEFINITION_NAME; - rule.s_ = "[native]"; - rule.ignoreSemanticValue = true; - rule.is_macro = true; - rule.params = {"x"}; - } + g["SuffixWithLabel"] = [&](const SemanticValues &vs, std::any &dt) { + auto ope = std::any_cast>(vs[0]); + if (vs.size() == 1) { + return ope; + } else { + assert(vs.size() == 2); + auto &data = *std::any_cast(dt); + const auto &ident = std::any_cast(vs[1]); + auto label = ref(*data.grammar, ident, vs.sv().data(), false, {}); + auto recovery = rec(ref(*data.grammar, RECOVER_DEFINITION_NAME, + vs.sv().data(), true, {label})); + return cho4label_(ope, recovery); + } + }; + + struct Loop { + enum class Type { opt = 0, zom, oom, rep }; + Type type; + std::pair range; + }; + + g["Suffix"] = [&](const SemanticValues &vs) { + auto ope = std::any_cast>(vs[0]); + if (vs.size() == 1) { + return ope; + } else { + assert(vs.size() == 2); + auto loop = std::any_cast(vs[1]); + switch (loop.type) { + case Loop::Type::opt: return opt(ope); + case Loop::Type::zom: return zom(ope); + case Loop::Type::oom: return oom(ope); + default: // Regex-like repetition + return rep(ope, loop.range.first, loop.range.second); + } + } + }; + + g["Loop"] = [&](const SemanticValues &vs) { + switch (vs.choice()) { + case 0: // Option + return Loop{Loop::Type::opt, std::pair()}; + case 1: // Zero or More + return Loop{Loop::Type::zom, std::pair()}; + case 2: // One or More + return Loop{Loop::Type::oom, std::pair()}; + default: // Regex-like repetition + return Loop{Loop::Type::rep, + std::any_cast>(vs[0])}; + } + }; + + g["Primary"] = [&](const SemanticValues &vs, std::any &dt) { + auto &data = *std::any_cast(dt); + + switch (vs.choice()) { + case 0: // Macro Reference + case 1: { // Reference + auto is_macro = vs.choice() == 0; + auto ignore = std::any_cast(vs[0]); + const auto &ident = std::any_cast(vs[1]); + + std::vector> args; + if (is_macro) { + args = std::any_cast>>(vs[2]); } - std::any dt = &data; - auto r = g["Grammar"].parse(s, n, dt, nullptr, log); + auto ope = ref(*data.grammar, ident, vs.sv().data(), is_macro, args); + if (ident == RECOVER_DEFINITION_NAME) { ope = rec(ope); } - if (!r.ret) { - if (log) { - if (r.error_info.message_pos) { - auto line = line_info(s, r.error_info.message_pos); - log(line.first, line.second, r.error_info.message, - r.error_info.label); - } else { - auto line = line_info(s, r.error_info.error_pos); - log(line.first, line.second, "syntax error", r.error_info.label); - } - } - return {}; + if (ignore) { + return ign(ope); + } else { + return ope; } + } + case 2: { // (Expression) + return std::any_cast>(vs[0]); + } + case 3: { // TokenBoundary + return tok(std::any_cast>(vs[0])); + } + case 4: { // CaptureScope + return csc(std::any_cast>(vs[0])); + } + case 5: { // Capture + const auto &name = std::any_cast(vs[0]); + auto ope = std::any_cast>(vs[1]); - // User provided rules - for (auto [user_name, user_rule] : rules) { - auto name = user_name; - auto ignore = false; - if (!name.empty() && name[0] == '~') { - ignore = true; - name.erase(0, 1); - } - if (!name.empty()) { - auto &rule = grammar[name]; - rule <= user_rule; - rule.name = name; - rule.ignoreSemanticValue = ignore; - } + data.captures_stack.back().insert(name); + data.captures_in_current_definition.insert(name); + + return cap(ope, [name](const char *a_s, size_t a_n, Context &c) { + c.capture_entries.emplace_back(name, std::string(a_s, a_n)); + }); + } + default: { + return std::any_cast>(vs[0]); + } + } + }; + + g["IdentCont"] = [](const SemanticValues &vs) { + return std::string(vs.sv().data(), vs.sv().length()); + }; + + g["Dictionary"] = [](const SemanticValues &vs) { + auto items = vs.transform(); + return dic(items, false); + }; + g["DictionaryI"] = [](const SemanticValues &vs) { + auto items = vs.transform(); + return dic(items, true); + }; + + g["Literal"] = [](const SemanticValues &vs) { + const auto &tok = vs.tokens.front(); + return lit(resolve_escape_sequence(tok.data(), tok.size())); + }; + g["LiteralI"] = [](const SemanticValues &vs) { + const auto &tok = vs.tokens.front(); + return liti(resolve_escape_sequence(tok.data(), tok.size())); + }; + g["LiteralD"] = [](const SemanticValues &vs) { + auto &tok = vs.tokens.front(); + return resolve_escape_sequence(tok.data(), tok.size()); + }; + g["LiteralID"] = [](const SemanticValues &vs) { + auto &tok = vs.tokens.front(); + return resolve_escape_sequence(tok.data(), tok.size()); + }; + + g["Class"] = [](const SemanticValues &vs) { + auto ranges = vs.transform>(); + return cls(ranges); + }; + g["ClassI"] = [](const SemanticValues &vs) { + auto ranges = vs.transform>(); + return cls(ranges, true); + }; + g["NegatedClass"] = [](const SemanticValues &vs) { + auto ranges = vs.transform>(); + return ncls(ranges); + }; + g["NegatedClassI"] = [](const SemanticValues &vs) { + auto ranges = vs.transform>(); + return ncls(ranges, true); + }; + g["Range"] = [](const SemanticValues &vs) { + switch (vs.choice()) { + case 0: { + auto s1 = std::any_cast(vs[0]); + auto s2 = std::any_cast(vs[1]); + auto cp1 = decode_codepoint(s1.data(), s1.length()); + auto cp2 = decode_codepoint(s2.data(), s2.length()); + if (cp1 > cp2) { + throw SyntaxErrorException("characer range is out of order...", + vs.line_info()); } + return std::pair(cp1, cp2); + } + case 1: { + auto s = std::any_cast(vs[0]); + auto cp = decode_codepoint(s.data(), s.length()); + return std::pair(cp, cp); + } + } + return std::pair(0, 0); + }; + g["Char"] = [](const SemanticValues &vs) { + return resolve_escape_sequence(vs.sv().data(), vs.sv().length()); + }; - // Check duplicated definitions - auto ret = true; + g["RepetitionRange"] = [&](const SemanticValues &vs) { + switch (vs.choice()) { + case 0: { // Number COMMA Number + auto min = std::any_cast(vs[0]); + auto max = std::any_cast(vs[1]); + return std::pair(min, max); + } + case 1: // Number COMMA + return std::pair(std::any_cast(vs[0]), + std::numeric_limits::max()); + case 2: { // Number + auto n = std::any_cast(vs[0]); + return std::pair(n, n); + } + default: // COMMA Number + return std::pair(std::numeric_limits::min(), + std::any_cast(vs[0])); + } + }; + g["Number"] = [&](const SemanticValues &vs) { + return vs.token_to_number(); + }; - if (!data.duplicates_of_definition.empty()) { - for (const auto &[name, ptr] : data.duplicates_of_definition) { - if (log) { - auto line = line_info(s, ptr); - log(line.first, line.second, - "The definition '" + name + "' is already defined.", ""); - } - } - ret = false; + g["CapScope"].enter = [](const Context & /*c*/, const char * /*s*/, + size_t /*n*/, std::any &dt) { + auto &data = *std::any_cast(dt); + data.captures_stack.emplace_back(); + }; + g["CapScope"].leave = [](const Context & /*c*/, const char * /*s*/, + size_t /*n*/, size_t /*matchlen*/, + std::any & /*value*/, std::any &dt) { + auto &data = *std::any_cast(dt); + data.captures_stack.pop_back(); + }; + + g["AND"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; + g["NOT"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; + g["QUESTION"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; + g["STAR"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; + g["PLUS"] = [](const SemanticValues &vs) { return *vs.sv().data(); }; + + g["DOT"] = [](const SemanticValues & /*vs*/) { return dot(); }; + + g["CUT"] = [](const SemanticValues & /*vs*/) { return cut(); }; + + g["BeginCap"] = [](const SemanticValues &vs) { return vs.token(); }; + + g["BackRef"] = [&](const SemanticValues &vs, std::any &dt) { + auto &data = *std::any_cast(dt); + + // Undefined back reference check + { + auto found = false; + auto it = data.captures_stack.rbegin(); + while (it != data.captures_stack.rend()) { + if (it->find(vs.token()) != it->end()) { + found = true; + break; + } + ++it; } - - // Check duplicated instructions - if (!data.duplicates_of_instruction.empty()) { - for (const auto &[type, ptr] : data.duplicates_of_instruction) { - if (log) { - auto line = line_info(s, ptr); - log(line.first, line.second, - "The instruction '" + type + "' is already defined.", ""); - } - } - ret = false; + if (!found) { + auto ptr = vs.token().data() - 1; // include '$' symbol + data.undefined_back_references.emplace_back(vs.token(), ptr); } + } - // Check undefined back references - if (!data.undefined_back_references.empty()) { - for (const auto &[name, ptr] : data.undefined_back_references) { - if (log) { - auto line = line_info(s, ptr); - log(line.first, line.second, - "The back reference '" + name + "' is undefined.", ""); - } - } - ret = false; + // NOTE: Disable packrat parsing if a back reference is not defined in + // captures in the current definition rule. + if (data.captures_in_current_definition.find(vs.token()) == + data.captures_in_current_definition.end()) { + data.enablePackratParsing = false; + } + + return bkr(vs.token_to_string()); + }; + + g["Ignore"] = [](const SemanticValues &vs) { return vs.size() > 0; }; + + g["Parameters"] = [](const SemanticValues &vs) { + return vs.transform(); + }; + + g["Arguments"] = [](const SemanticValues &vs) { + return vs.transform>(); + }; + + g["PrecedenceClimbing"] = [](const SemanticValues &vs) { + PrecedenceClimbing::BinOpeInfo binOpeInfo; + size_t level = 1; + for (const auto &v : vs) { + auto tokens = std::any_cast>(v); + auto assoc = tokens[0][0]; + for (size_t i = 1; i < tokens.size(); i++) { + binOpeInfo[tokens[i]] = std::pair(level, assoc); } + level++; + } + Instruction instruction; + instruction.type = "precedence"; + instruction.data = binOpeInfo; + instruction.sv = vs.sv(); + return instruction; + }; + g["PrecedenceInfo"] = [](const SemanticValues &vs) { + return vs.transform(); + }; + g["PrecedenceOpe"] = [](const SemanticValues &vs) { return vs.token(); }; + g["PrecedenceAssoc"] = [](const SemanticValues &vs) { return vs.token(); }; - // Set root definition - auto start = data.start; + g["ErrorMessage"] = [](const SemanticValues &vs) { + Instruction instruction; + instruction.type = "error_message"; + instruction.data = std::any_cast(vs[0]); + instruction.sv = vs.sv(); + return instruction; + }; - if (!requested_start.empty()) { - if (grammar.count(requested_start)) { - start = requested_start; - } else { - if (log) { - auto line = line_info(s, s); - log(line.first, line.second, - "The specified start rule '" + requested_start + - "' is undefined.", - ""); - } - ret = false; - } - } + g["NoAstOpt"] = [](const SemanticValues &vs) { + Instruction instruction; + instruction.type = "no_ast_opt"; + instruction.sv = vs.sv(); + return instruction; + }; - if (!ret) { return {}; } + g["Instruction"] = [](const SemanticValues &vs) { + return vs.transform(); + }; + } - auto &start_rule = grammar[start]; + bool apply_precedence_instruction(Definition &rule, + const PrecedenceClimbing::BinOpeInfo &info, + const char *s, Log log) { + try { + auto &seq = dynamic_cast(*rule.get_core_operator()); + auto atom = seq.opes_[0]; + auto &rep = dynamic_cast(*seq.opes_[1]); + auto &seq1 = dynamic_cast(*rep.ope_); + auto binop = seq1.opes_[0]; + auto atom1 = seq1.opes_[1]; - // Check if the start rule has ignore operator - { - if (start_rule.ignoreSemanticValue) { - if (log) { - auto line = line_info(s, start_rule.s_); - log(line.first, line.second, - "Ignore operator cannot be applied to '" + start_rule.name + "'.", - ""); - } - ret = false; - } - } + auto atom_name = dynamic_cast(*atom).name_; + auto binop_name = dynamic_cast(*binop).name_; + auto atom1_name = dynamic_cast(*atom1).name_; - if (!ret) { return {}; } - - // Check missing definitions - auto referenced = std::unordered_set{ - WHITESPACE_DEFINITION_NAME, - WORD_DEFINITION_NAME, - RECOVER_DEFINITION_NAME, - start_rule.name, - }; - - for (auto &[_, rule] : grammar) { - ReferenceChecker vis(grammar, rule.params); - rule.accept(vis); - referenced.insert(vis.referenced.begin(), vis.referenced.end()); - for (const auto &[name, ptr] : vis.error_s) { - if (log) { - auto line = line_info(s, ptr); - log(line.first, line.second, vis.error_message[name], ""); - } - ret = false; - } - } - - for (auto &[name, rule] : grammar) { - if (!referenced.count(name)) { - if (log) { - auto line = line_info(s, rule.s_); - auto msg = "'" + name + "' is not referenced."; - log(line.first, line.second, msg, ""); - } - } - } - - if (!ret) { return {}; } - - // Link references - for (auto &x : grammar) { - auto &rule = x.second; - LinkReferences vis(grammar, rule.params); - rule.accept(vis); - } - - // Check left recursion - ret = true; - - for (auto &[name, rule] : grammar) { - DetectLeftRecursion vis(name); - rule.accept(vis); - if (vis.error_s) { - if (log) { - auto line = line_info(s, vis.error_s); - log(line.first, line.second, "'" + name + "' is left recursive.", ""); - } - ret = false; - } - } - - if (!ret) { return {}; } - - // Check infinite loop - if (detect_infiniteLoop(data, start_rule, log, s)) { return {}; } - - // Automatic whitespace skipping - if (grammar.count(WHITESPACE_DEFINITION_NAME)) { - for (auto &x : grammar) { - auto &rule = x.second; - auto ope = rule.get_core_operator(); - if (IsLiteralToken::check(*ope)) { rule <= tok(ope); } - } - - auto &rule = grammar[WHITESPACE_DEFINITION_NAME]; - start_rule.whitespaceOpe = wsp(rule.get_core_operator()); - - if (detect_infiniteLoop(data, rule, log, s)) { return {}; } - } - - // Word expression - if (grammar.count(WORD_DEFINITION_NAME)) { - auto &rule = grammar[WORD_DEFINITION_NAME]; - start_rule.wordOpe = rule.get_core_operator(); - - if (detect_infiniteLoop(data, rule, log, s)) { return {}; } - } - - // Apply instructions - for (const auto &[name, instructions] : data.instructions) { - auto &rule = grammar[name]; - - for (const auto &instruction : instructions) { - if (instruction.type == "precedence") { - const auto &info = - std::any_cast(instruction.data); - - if (!apply_precedence_instruction(rule, info, s, log)) { return {}; } - } else if (instruction.type == "error_message") { - rule.error_message = std::any_cast(instruction.data); - } else if (instruction.type == "no_ast_opt") { - rule.no_ast_opt = true; - } - } - } - - return {data.grammar, start, data.enablePackratParsing}; - } - - bool detect_infiniteLoop(const Data &data, Definition &rule, const Log &log, - const char *s) const { - std::vector> refs; - std::unordered_map has_error_cache; - DetectInfiniteLoop vis(data.start_pos, rule.name, refs, has_error_cache); - rule.accept(vis); - if (vis.has_error) { - if (log) { - auto line = line_info(s, vis.error_s); - log(line.first, line.second, - "infinite loop is detected in '" + vis.error_name + "'.", ""); - } - return true; + if (!rep.is_zom() || atom_name != atom1_name || atom_name == binop_name) { + if (log) { + auto line = line_info(s, rule.s_); + log(line.first, line.second, + "'precedence' instruction cannot be applied to '" + rule.name + + "'.", + ""); } return false; + } + + rule.holder_->ope_ = pre(atom, binop, info, rule); + rule.disable_action = true; + } catch (...) { + if (log) { + auto line = line_info(s, rule.s_); + log(line.first, line.second, + "'precedence' instruction cannot be applied to '" + rule.name + + "'.", + ""); + } + return false; + } + return true; + } + + ParserContext perform_core(const char *s, size_t n, const Rules &rules, + Log log, std::string requested_start, + bool enable_left_recursion = true) { + Data data; + auto &grammar = *data.grammar; + + // Built-in macros + { + // `%recover` + { + auto &rule = grammar[RECOVER_DEFINITION_NAME]; + rule <= ref(grammar, "x", "", false, {}); + rule.name = RECOVER_DEFINITION_NAME; + rule.s_ = "[native]"; + rule.ignoreSemanticValue = true; + rule.is_macro = true; + rule.params = {"x"}; + } } - Grammar g; + try { + std::any dt = &data; + auto r = g["Grammar"].parse(s, n, dt, nullptr, log); + + if (!r.ret) { + if (log) { + if (r.error_info.message_pos) { + auto line = line_info(s, r.error_info.message_pos); + log(line.first, line.second, r.error_info.message, + r.error_info.label); + } else { + auto line = line_info(s, r.error_info.error_pos); + log(line.first, line.second, "syntax error", r.error_info.label); + } + } + return {}; + } + } catch (const SyntaxErrorException &e) { + if (log) { + auto line = e.line_info(); + log(line.first, line.second, e.what(), ""); + } + return {}; + } + + // User provided rules + for (auto [user_name, user_rule] : rules) { + auto name = user_name; + auto ignore = false; + if (!name.empty() && name[0] == '~') { + ignore = true; + name.erase(0, 1); + } + if (!name.empty()) { + auto &rule = grammar[name]; + rule <= user_rule; + rule.name = name; + rule.ignoreSemanticValue = ignore; + } + } + + // Check duplicated definitions + auto ret = true; + + if (!data.duplicates_of_definition.empty()) { + for (const auto &[name, ptr] : data.duplicates_of_definition) { + if (log) { + auto line = line_info(s, ptr); + log(line.first, line.second, + "the definition '" + name + "' is already defined.", ""); + } + } + ret = false; + } + + // Check duplicated instructions + if (!data.duplicates_of_instruction.empty()) { + for (const auto &[type, ptr] : data.duplicates_of_instruction) { + if (log) { + auto line = line_info(s, ptr); + log(line.first, line.second, + "the instruction '" + type + "' is already defined.", ""); + } + } + ret = false; + } + + // Check undefined back references + if (!data.undefined_back_references.empty()) { + for (const auto &[name, ptr] : data.undefined_back_references) { + if (log) { + auto line = line_info(s, ptr); + log(line.first, line.second, + "the back reference '" + name + "' is undefined.", ""); + } + } + ret = false; + } + + // Set root definition + auto start = data.start; + + if (!requested_start.empty()) { + if (grammar.count(requested_start)) { + start = requested_start; + } else { + if (log) { + auto line = line_info(s, s); + log(line.first, line.second, + "the specified start rule '" + requested_start + + "' is undefined.", + ""); + } + ret = false; + } + } + + if (!ret) { return {}; } + + auto &start_rule = grammar[start]; + + // Check if the start rule has ignore operator + { + if (start_rule.ignoreSemanticValue) { + if (log) { + auto line = line_info(s, start_rule.s_); + log(line.first, line.second, + "ignore operator cannot be applied to '" + start_rule.name + "'.", + ""); + } + ret = false; + } + } + + if (!ret) { return {}; } + + // Check missing definitions + auto referenced = std::unordered_set{ + WHITESPACE_DEFINITION_NAME, + WORD_DEFINITION_NAME, + RECOVER_DEFINITION_NAME, + start_rule.name, + }; + + for (auto &[_, rule] : grammar) { + ReferenceChecker vis(grammar, rule.params); + rule.accept(vis); + referenced.insert(vis.referenced.begin(), vis.referenced.end()); + for (const auto &[name, ptr] : vis.error_s) { + if (log) { + auto line = line_info(s, ptr); + log(line.first, line.second, vis.error_message[name], ""); + } + ret = false; + } + } + + for (auto &[name, rule] : grammar) { + if (!referenced.count(name)) { + if (log) { + auto line = line_info(s, rule.s_); + auto msg = "'" + name + "' is not referenced."; + log(line.first, line.second, msg, ""); + } + } + } + + if (!ret) { return {}; } + + // Link references + for (auto &x : grammar) { + auto &rule = x.second; + LinkReferences vis(grammar, rule.params); + rule.accept(vis); + } + + // Compute can_be_empty for each rule (fixed-point iteration) + { + bool changed = true; + while (changed) { + changed = false; + for (auto &[name, rule] : grammar) { + ComputeCanBeEmpty vis; + rule.accept(vis); + if (vis.result != rule.can_be_empty) { + rule.can_be_empty = vis.result; + changed = true; + } + } + } + } + + // Check left recursion + if (enable_left_recursion) { + for (auto &[name, rule] : grammar) { + DetectLeftRecursion vis(name); + rule.accept(vis); + if (vis.error_s) { rule.is_left_recursive = true; } + } + } else { + ret = true; + + for (auto &[name, rule] : grammar) { + DetectLeftRecursion vis(name); + rule.accept(vis); + if (vis.error_s) { + if (log) { + auto line = line_info(s, vis.error_s); + log(line.first, line.second, "'" + name + "' is left recursive.", + ""); + } + ret = false; + } + } + + if (!ret) { return {}; } + } + + // Check infinite loop + if (detect_infiniteLoop(data, start_rule, log, s)) { return {}; } + + // Automatic whitespace skipping + if (grammar.count(WHITESPACE_DEFINITION_NAME)) { + for (auto &x : grammar) { + auto &rule = x.second; + auto ope = rule.get_core_operator(); + if (IsLiteralToken::check(*ope)) { rule <= tok(ope); } + } + + auto &rule = grammar[WHITESPACE_DEFINITION_NAME]; + start_rule.whitespaceOpe = wsp(rule.get_core_operator()); + + if (detect_infiniteLoop(data, rule, log, s)) { return {}; } + } + + // Word expression + if (grammar.count(WORD_DEFINITION_NAME)) { + auto &rule = grammar[WORD_DEFINITION_NAME]; + start_rule.wordOpe = rule.get_core_operator(); + + if (detect_infiniteLoop(data, rule, log, s)) { return {}; } + } + + // Apply instructions + for (const auto &[name, instructions] : data.instructions) { + auto &rule = grammar[name]; + + for (const auto &instruction : instructions) { + if (instruction.type == "precedence") { + const auto &info = + std::any_cast(instruction.data); + + if (!apply_precedence_instruction(rule, info, s, log)) { return {}; } + } else if (instruction.type == "error_message") { + rule.error_message = std::any_cast(instruction.data); + } else if (instruction.type == "no_ast_opt") { + rule.no_ast_opt = true; + } + } + } + + // Setup First-Set and ISpan optimizations + for (auto &x : grammar) { + SetupFirstSets vis; + x.second.accept(vis); + } + + return {data.grammar, start, data.enablePackratParsing}; + } + + bool detect_infiniteLoop(const Data &data, Definition &rule, const Log &log, + const char *s) const { + std::vector> refs; + std::unordered_map has_error_cache; + DetectInfiniteLoop vis(data.start_pos, rule.name, refs, has_error_cache); + rule.accept(vis); + if (vis.has_error) { + if (log) { + auto line = line_info(s, vis.error_s); + log(line.first, line.second, + "infinite loop is detected in '" + vis.error_name + "'.", ""); + } + return true; + } + return false; + } + + Grammar g; }; /*----------------------------------------------------------------------------- @@ -4254,163 +5069,163 @@ private: *---------------------------------------------------------------------------*/ template struct AstBase : public Annotation { - AstBase(const char *path, size_t line, size_t column, const char *name, - const std::vector> &nodes, - size_t position = 0, size_t length = 0, size_t choice_count = 0, - size_t choice = 0) - : path(path ? path : ""), line(line), column(column), name(name), - position(position), length(length), choice_count(choice_count), - choice(choice), original_name(name), - original_choice_count(choice_count), original_choice(choice), - tag(str2tag(name)), original_tag(tag), is_token(false), nodes(nodes) {} + AstBase(const char *path, size_t line, size_t column, const char *name, + const std::vector> &nodes, + size_t position = 0, size_t length = 0, size_t choice_count = 0, + size_t choice = 0) + : path(path ? path : ""), line(line), column(column), name(name), + position(position), length(length), choice_count(choice_count), + choice(choice), original_name(name), + original_choice_count(choice_count), original_choice(choice), + tag(str2tag(name)), original_tag(tag), is_token(false), nodes(nodes) {} - AstBase(const char *path, size_t line, size_t column, const char *name, - const std::string_view &token, size_t position = 0, size_t length = 0, - size_t choice_count = 0, size_t choice = 0) - : path(path ? path : ""), line(line), column(column), name(name), - position(position), length(length), choice_count(choice_count), - choice(choice), original_name(name), - original_choice_count(choice_count), original_choice(choice), - tag(str2tag(name)), original_tag(tag), is_token(true), token(token) {} + AstBase(const char *path, size_t line, size_t column, const char *name, + const std::string_view &token, size_t position = 0, size_t length = 0, + size_t choice_count = 0, size_t choice = 0) + : path(path ? path : ""), line(line), column(column), name(name), + position(position), length(length), choice_count(choice_count), + choice(choice), original_name(name), + original_choice_count(choice_count), original_choice(choice), + tag(str2tag(name)), original_tag(tag), is_token(true), token(token) {} - AstBase(const AstBase &ast, const char *original_name, size_t position = 0, - size_t length = 0, size_t original_choice_count = 0, - size_t original_choice = 0) - : path(ast.path), line(ast.line), column(ast.column), name(ast.name), - position(position), length(length), choice_count(ast.choice_count), - choice(ast.choice), original_name(original_name), - original_choice_count(original_choice_count), - original_choice(original_choice), tag(ast.tag), - original_tag(str2tag(original_name)), is_token(ast.is_token), - token(ast.token), nodes(ast.nodes), parent(ast.parent) {} + AstBase(const AstBase &ast, const char *original_name, size_t position = 0, + size_t length = 0, size_t original_choice_count = 0, + size_t original_choice = 0) + : path(ast.path), line(ast.line), column(ast.column), name(ast.name), + position(position), length(length), choice_count(ast.choice_count), + choice(ast.choice), original_name(original_name), + original_choice_count(original_choice_count), + original_choice(original_choice), tag(ast.tag), + original_tag(str2tag(original_name)), is_token(ast.is_token), + token(ast.token), nodes(ast.nodes), parent(ast.parent) {} - const std::string path; - const size_t line = 1; - const size_t column = 1; + const std::string path; + const size_t line = 1; + const size_t column = 1; - const std::string name; - size_t position; - size_t length; - const size_t choice_count; - const size_t choice; - const std::string original_name; - const size_t original_choice_count; - const size_t original_choice; - const unsigned int tag; - const unsigned int original_tag; + const std::string name; + size_t position; + size_t length; + const size_t choice_count; + const size_t choice; + const std::string original_name; + const size_t original_choice_count; + const size_t original_choice; + const unsigned int tag; + const unsigned int original_tag; - const bool is_token; - const std::string_view token; + const bool is_token; + const std::string_view token; - std::vector>> nodes; - std::weak_ptr> parent; + std::vector>> nodes; + std::weak_ptr> parent; - std::string token_to_string() const { - assert(is_token); - return std::string(token); - } + std::string token_to_string() const { + assert(is_token); + return std::string(token); + } - template T token_to_number() const { - return token_to_number_(token); - } + template T token_to_number() const { + return token_to_number_(token); + } }; template void ast_to_s_core(const std::shared_ptr &ptr, std::string &s, int level, std::function fn) { - const auto &ast = *ptr; - for (auto i = 0; i < level; i++) { - s += " "; - } - auto name = ast.original_name; - if (ast.original_choice_count > 0) { - name += "/" + std::to_string(ast.original_choice); - } - if (ast.name != ast.original_name) { name += "[" + ast.name + "]"; } - if (ast.is_token) { - s += "- " + name + " ("; - s += ast.token; - s += ")\n"; - } else { - s += "+ " + name + "\n"; - } - if (fn) { s += fn(ast, level + 1); } - for (auto node : ast.nodes) { - ast_to_s_core(node, s, level + 1, fn); - } + const auto &ast = *ptr; + for (auto i = 0; i < level; i++) { + s += " "; + } + auto name = ast.original_name; + if (ast.original_choice_count > 0) { + name += "/" + std::to_string(ast.original_choice); + } + if (ast.name != ast.original_name) { name += "[" + ast.name + "]"; } + if (ast.is_token) { + s += "- " + name + " ("; + s += ast.token; + s += ")\n"; + } else { + s += "+ " + name + "\n"; + } + if (fn) { s += fn(ast, level + 1); } + for (const auto &node : ast.nodes) { + ast_to_s_core(node, s, level + 1, fn); + } } template std::string ast_to_s(const std::shared_ptr &ptr, std::function fn = nullptr) { - std::string s; - ast_to_s_core(ptr, s, 0, fn); - return s; + std::string s; + ast_to_s_core(ptr, s, 0, fn); + return s; } struct AstOptimizer { - AstOptimizer(bool mode, const std::vector &rules = {}) - : mode_(mode), rules_(rules) {} + AstOptimizer(bool mode, const std::vector &rules = {}) + : mode_(mode), rules_(rules) {} - template - std::shared_ptr optimize(std::shared_ptr original, - std::shared_ptr parent = nullptr) { - auto found = - std::find(rules_.begin(), rules_.end(), original->name) != rules_.end(); - auto opt = mode_ ? !found : found; + template + std::shared_ptr optimize(std::shared_ptr original, + std::shared_ptr parent = nullptr) { + auto found = + std::find(rules_.begin(), rules_.end(), original->name) != rules_.end(); + auto opt = mode_ ? !found : found; - if (opt && original->nodes.size() == 1) { - auto child = optimize(original->nodes[0], parent); - auto ast = std::make_shared(*child, original->name.data(), - original->choice_count, original->position, - original->length, original->choice); - for (auto node : ast->nodes) { - node->parent = ast; - } - return ast; - } - - auto ast = std::make_shared(*original); - ast->parent = parent; - ast->nodes.clear(); - for (auto node : original->nodes) { - auto child = optimize(node, ast); - ast->nodes.push_back(child); - } - return ast; + if (opt && original->nodes.size() == 1) { + auto child = optimize(original->nodes[0], parent); + auto ast = std::make_shared(*child, original->name.data(), + original->position, original->length, + original->choice_count, original->choice); + for (auto &node : ast->nodes) { + node->parent = ast; + } + return ast; } + auto ast = std::make_shared(*original); + ast->parent = parent; + ast->nodes.clear(); + for (const auto &node : original->nodes) { + auto child = optimize(node, ast); + ast->nodes.push_back(child); + } + return ast; + } + private: - const bool mode_; - const std::vector rules_; + const bool mode_; + const std::vector rules_; }; struct EmptyType {}; using Ast = AstBase; template void add_ast_action(Definition &rule) { - rule.action = [&](const SemanticValues &vs) { - auto line = vs.line_info(); + rule.action = [&](const SemanticValues &vs) { + auto line = vs.line_info(); - if (rule.is_token()) { - return std::make_shared( - vs.path, line.first, line.second, rule.name.data(), vs.token(), - std::distance(vs.ss, vs.sv().data()), vs.sv().length(), - vs.choice_count(), vs.choice()); - } + if (rule.is_token()) { + return std::make_shared( + vs.path, line.first, line.second, rule.name.data(), vs.token(), + std::distance(vs.ss, vs.sv().data()), vs.sv().length(), + vs.choice_count(), vs.choice()); + } - auto ast = - std::make_shared(vs.path, line.first, line.second, rule.name.data(), - vs.transform>(), - std::distance(vs.ss, vs.sv().data()), - vs.sv().length(), vs.choice_count(), vs.choice()); + auto ast = + std::make_shared(vs.path, line.first, line.second, rule.name.data(), + vs.transform>(), + std::distance(vs.ss, vs.sv().data()), + vs.sv().length(), vs.choice_count(), vs.choice()); - for (auto node : ast->nodes) { - node->parent = ast; - } - return ast; - }; + for (auto &node : ast->nodes) { + node->parent = ast; + } + return ast; + }; } #define PEG_EXPAND(...) __VA_ARGS__ @@ -4550,229 +5365,235 @@ template void add_ast_action(Definition &rule) { class parser { public: - parser() = default; + parser() = default; - parser(const char *s, size_t n, const Rules &rules, - std::string_view start = {}) { - load_grammar(s, n, rules, start); - } + parser(const char *s, size_t n, const Rules &rules, + std::string_view start = {}) { + load_grammar(s, n, rules, start); + } - parser(const char *s, size_t n, std::string_view start = {}) - : parser(s, n, Rules(), start) {} + parser(const char *s, size_t n, std::string_view start = {}) + : parser(s, n, Rules(), start) {} - parser(std::string_view sv, const Rules &rules, std::string_view start = {}) - : parser(sv.data(), sv.size(), rules, start) {} + parser(std::string_view sv, const Rules &rules, std::string_view start = {}) + : parser(sv.data(), sv.size(), rules, start) {} - parser(std::string_view sv, std::string_view start = {}) - : parser(sv.data(), sv.size(), Rules(), start) {} + parser(std::string_view sv, std::string_view start = {}) + : parser(sv.data(), sv.size(), Rules(), start) {} #if defined(__cpp_lib_char8_t) - parser(std::u8string_view sv, const Rules &rules, std::string_view start = {}) - : parser(reinterpret_cast(sv.data()), sv.size(), rules, - start) {} + parser(std::u8string_view sv, const Rules &rules, std::string_view start = {}) + : parser(reinterpret_cast(sv.data()), sv.size(), rules, + start) {} - parser(std::u8string_view sv, std::string_view start = {}) - : parser(reinterpret_cast(sv.data()), sv.size(), Rules(), - start) {} + parser(std::u8string_view sv, std::string_view start = {}) + : parser(reinterpret_cast(sv.data()), sv.size(), Rules(), + start) {} #endif - operator bool() { return grammar_ != nullptr; } + operator bool() const { return grammar_ != nullptr; } - bool load_grammar(const char *s, size_t n, const Rules &rules, - std::string_view start = {}) { - auto cxt = ParserGenerator::parse(s, n, rules, log_, start); - grammar_ = cxt.grammar; - start_ = cxt.start; - enablePackratParsing_ = cxt.enablePackratParsing; - return grammar_ != nullptr; + bool load_grammar(const char *s, size_t n, const Rules &rules, + std::string_view start = {}) { + auto cxt = + ParserGenerator::parse(s, n, rules, log_, start, enableLeftRecursion_); + grammar_ = cxt.grammar; + start_ = cxt.start; + enablePackratParsing_ = cxt.enablePackratParsing; + return grammar_ != nullptr; + } + + bool load_grammar(const char *s, size_t n, std::string_view start = {}) { + return load_grammar(s, n, Rules(), start); + } + + bool load_grammar(std::string_view sv, const Rules &rules, + std::string_view start = {}) { + return load_grammar(sv.data(), sv.size(), rules, start); + } + + bool load_grammar(std::string_view sv, std::string_view start = {}) { + return load_grammar(sv.data(), sv.size(), Rules(), start); + } + + bool parse_n(const char *s, size_t n, const char *path = nullptr) const { + if (grammar_ != nullptr) { + const auto &rule = (*grammar_)[start_]; + auto result = rule.parse(s, n, path, log_); + return post_process(s, n, result); } + return false; + } - bool load_grammar(const char *s, size_t n, std::string_view start = {}) { - return load_grammar(s, n, Rules(), start); - } - - bool load_grammar(std::string_view sv, const Rules &rules, - std::string_view start = {}) { - return load_grammar(sv.data(), sv.size(), rules, start); - } - - bool load_grammar(std::string_view sv, std::string_view start = {}) { - return load_grammar(sv.data(), sv.size(), start); - } - - bool parse_n(const char *s, size_t n, const char *path = nullptr) const { - if (grammar_ != nullptr) { - const auto &rule = (*grammar_)[start_]; - auto result = rule.parse(s, n, path, log_); - return post_process(s, n, result); - } - return false; - } - - bool parse_n(const char *s, size_t n, std::any &dt, - const char *path = nullptr) const { - if (grammar_ != nullptr) { - const auto &rule = (*grammar_)[start_]; - auto result = rule.parse(s, n, dt, path, log_); - return post_process(s, n, result); - } - return false; - } - - template - bool parse_n(const char *s, size_t n, T &val, - const char *path = nullptr) const { - if (grammar_ != nullptr) { - const auto &rule = (*grammar_)[start_]; - auto result = rule.parse_and_get_value(s, n, val, path, log_); - return post_process(s, n, result); - } - return false; - } - - template - bool parse_n(const char *s, size_t n, std::any &dt, T &val, - const char *path = nullptr) const { - if (grammar_ != nullptr) { - const auto &rule = (*grammar_)[start_]; - auto result = rule.parse_and_get_value(s, n, dt, val, path, log_); - return post_process(s, n, result); - } - return false; - } - - bool parse(std::string_view sv, const char *path = nullptr) const { - return parse_n(sv.data(), sv.size(), path); - } - - bool parse(std::string_view sv, std::any &dt, + bool parse_n(const char *s, size_t n, std::any &dt, const char *path = nullptr) const { - return parse_n(sv.data(), sv.size(), dt, path); + if (grammar_ != nullptr) { + const auto &rule = (*grammar_)[start_]; + auto result = rule.parse(s, n, dt, path, log_); + return post_process(s, n, result); } + return false; + } - template - bool parse(std::string_view sv, T &val, const char *path = nullptr) const { - return parse_n(sv.data(), sv.size(), val, path); - } - - template - bool parse(std::string_view sv, std::any &dt, T &val, + template + bool parse_n(const char *s, size_t n, T &val, const char *path = nullptr) const { - return parse_n(sv.data(), sv.size(), dt, val, path); + if (grammar_ != nullptr) { + const auto &rule = (*grammar_)[start_]; + auto result = rule.parse_and_get_value(s, n, val, path, log_); + return post_process(s, n, result); } + return false; + } + + template + bool parse_n(const char *s, size_t n, std::any &dt, T &val, + const char *path = nullptr) const { + if (grammar_ != nullptr) { + const auto &rule = (*grammar_)[start_]; + auto result = rule.parse_and_get_value(s, n, dt, val, path, log_); + return post_process(s, n, result); + } + return false; + } + + bool parse(std::string_view sv, const char *path = nullptr) const { + return parse_n(sv.data(), sv.size(), path); + } + + bool parse(std::string_view sv, std::any &dt, + const char *path = nullptr) const { + return parse_n(sv.data(), sv.size(), dt, path); + } + + template + bool parse(std::string_view sv, T &val, const char *path = nullptr) const { + return parse_n(sv.data(), sv.size(), val, path); + } + + template + bool parse(std::string_view sv, std::any &dt, T &val, + const char *path = nullptr) const { + return parse_n(sv.data(), sv.size(), dt, val, path); + } #if defined(__cpp_lib_char8_t) - bool parse(std::u8string_view sv, const char *path = nullptr) const { - return parse_n(reinterpret_cast(sv.data()), sv.size(), path); - } + bool parse(std::u8string_view sv, const char *path = nullptr) const { + return parse_n(reinterpret_cast(sv.data()), sv.size(), path); + } - bool parse(std::u8string_view sv, std::any &dt, - const char *path = nullptr) const { - return parse_n(reinterpret_cast(sv.data()), sv.size(), dt, - path); - } + bool parse(std::u8string_view sv, std::any &dt, + const char *path = nullptr) const { + return parse_n(reinterpret_cast(sv.data()), sv.size(), dt, + path); + } - template - bool parse(std::u8string_view sv, T &val, const char *path = nullptr) const { - return parse_n(reinterpret_cast(sv.data()), sv.size(), val, - path); - } + template + bool parse(std::u8string_view sv, T &val, const char *path = nullptr) const { + return parse_n(reinterpret_cast(sv.data()), sv.size(), val, + path); + } - template - bool parse(std::u8string_view sv, std::any &dt, T &val, - const char *path = nullptr) const { - return parse_n(reinterpret_cast(sv.data()), sv.size(), dt, - val, path); - } + template + bool parse(std::u8string_view sv, std::any &dt, T &val, + const char *path = nullptr) const { + return parse_n(reinterpret_cast(sv.data()), sv.size(), dt, + val, path); + } #endif - Definition &operator[](const char *s) { return (*grammar_)[s]; } + Definition &operator[](const char *s) { return (*grammar_)[s]; } - const Definition &operator[](const char *s) const { return (*grammar_)[s]; } + const Definition &operator[](const char *s) const { return (*grammar_)[s]; } - const Grammar &get_grammar() const { return *grammar_; } + const Grammar &get_grammar() const { return *grammar_; } - void disable_eoi_check() { - if (grammar_ != nullptr) { - auto &rule = (*grammar_)[start_]; - rule.eoi_check = false; - } + void disable_eoi_check() { + if (grammar_ != nullptr) { + auto &rule = (*grammar_)[start_]; + rule.eoi_check = false; } + } - void enable_packrat_parsing() { - if (grammar_ != nullptr) { - auto &rule = (*grammar_)[start_]; - rule.enablePackratParsing = enablePackratParsing_; - } + void enable_left_recursion(bool enable = true) { + enableLeftRecursion_ = enable; + } + + void enable_packrat_parsing() { + if (grammar_ != nullptr) { + auto &rule = (*grammar_)[start_]; + rule.enablePackratParsing = enablePackratParsing_; } + } - void enable_trace(TracerEnter tracer_enter, TracerLeave tracer_leave) { - if (grammar_ != nullptr) { - auto &rule = (*grammar_)[start_]; - rule.tracer_enter = tracer_enter; - rule.tracer_leave = tracer_leave; - } + void enable_trace(TracerEnter tracer_enter, TracerLeave tracer_leave) { + if (grammar_ != nullptr) { + auto &rule = (*grammar_)[start_]; + rule.tracer_enter = tracer_enter; + rule.tracer_leave = tracer_leave; } + } - void enable_trace(TracerEnter tracer_enter, TracerLeave tracer_leave, - TracerStartOrEnd tracer_start, - TracerStartOrEnd tracer_end) { - if (grammar_ != nullptr) { - auto &rule = (*grammar_)[start_]; - rule.tracer_enter = tracer_enter; - rule.tracer_leave = tracer_leave; - rule.tracer_start = tracer_start; - rule.tracer_end = tracer_end; - } + void enable_trace(TracerEnter tracer_enter, TracerLeave tracer_leave, + TracerStartOrEnd tracer_start, + TracerStartOrEnd tracer_end) { + if (grammar_ != nullptr) { + auto &rule = (*grammar_)[start_]; + rule.tracer_enter = tracer_enter; + rule.tracer_leave = tracer_leave; + rule.tracer_start = tracer_start; + rule.tracer_end = tracer_end; } + } - void set_verbose_trace(bool verbose_trace) { - if (grammar_ != nullptr) { - auto &rule = (*grammar_)[start_]; - rule.verbose_trace = verbose_trace; - } + void set_verbose_trace(bool verbose_trace) { + if (grammar_ != nullptr) { + auto &rule = (*grammar_)[start_]; + rule.verbose_trace = verbose_trace; } + } - template parser &enable_ast() { - for (auto &[_, rule] : *grammar_) { - if (!rule.action) { add_ast_action(rule); } - } - return *this; + template parser &enable_ast() { + for (auto &[_, rule] : *grammar_) { + if (!rule.action) { add_ast_action(rule); } } + return *this; + } - template - std::shared_ptr optimize_ast(std::shared_ptr ast, - bool opt_mode = true) const { - return AstOptimizer(opt_mode, get_no_ast_opt_rules()).optimize(ast); - } + template + std::shared_ptr optimize_ast(std::shared_ptr ast, + bool opt_mode = true) const { + return AstOptimizer(opt_mode, get_no_ast_opt_rules()).optimize(ast); + } - void set_logger(Log log) { log_ = log; } + void set_logger(Log log) { log_ = log; } - void set_logger( - std::function - log) { - log_ = [log](size_t line, size_t col, const std::string &msg, - const std::string & /*rule*/) { log(line, col, msg); }; - } + void set_logger( + std::function + log) { + log_ = [log](size_t line, size_t col, const std::string &msg, + const std::string & /*rule*/) { log(line, col, msg); }; + } private: - bool post_process(const char *s, size_t n, Definition::Result &r) const { - if (log_ && !r.ret) { r.error_info.output_log(log_, s, n); } - return r.ret && !r.recovered; - } + bool post_process(const char *s, size_t n, Definition::Result &r) const { + if (log_ && !r.ret) { r.error_info.output_log(log_, s, n); } + return r.ret && !r.recovered; + } - std::vector get_no_ast_opt_rules() const { - std::vector rules; - for (auto &[name, rule] : *grammar_) { - if (rule.no_ast_opt) { rules.push_back(name); } - } - return rules; + std::vector get_no_ast_opt_rules() const { + std::vector rules; + for (auto &[name, rule] : *grammar_) { + if (rule.no_ast_opt) { rules.push_back(name); } } + return rules; + } - std::shared_ptr grammar_; - std::string start_; - bool enablePackratParsing_ = false; - Log log_; + std::shared_ptr grammar_; + std::string start_; + bool enableLeftRecursion_ = true; + bool enablePackratParsing_ = false; + Log log_; }; /*----------------------------------------------------------------------------- @@ -4780,59 +5601,59 @@ private: *---------------------------------------------------------------------------*/ inline void enable_tracing(parser &parser, std::ostream &os) { - parser.enable_trace( - [&](auto &ope, auto s, auto, auto &, auto &c, auto &, auto &trace_data) { - auto prev_pos = std::any_cast(trace_data); - auto pos = static_cast(s - c.s); - auto backtrack = (pos < prev_pos ? "*" : ""); - std::string indent; - auto level = c.trace_ids.size() - 1; - while (level--) { - indent += "│"; - } - std::string name; - { - name = peg::TraceOpeName::get(const_cast(ope)); + parser.enable_trace( + [&](auto &ope, auto s, auto, auto &, auto &c, auto &, auto &trace_data) { + auto prev_pos = std::any_cast(trace_data); + auto pos = static_cast(s - c.s); + auto backtrack = (pos < prev_pos ? "*" : ""); + std::string indent; + auto level = c.trace_ids.size() - 1; + while (level--) { + indent += "│"; + } + std::string name; + { + name = peg::TraceOpeName::get(const_cast(ope)); - auto lit = dynamic_cast(&ope); - if (lit) { name += " '" + peg::escape_characters(lit->lit_) + "'"; } - } - os << "E " << pos + 1 << backtrack << "\t" << indent << "┌" << name - << " #" << c.trace_ids.back() << std::endl; - trace_data = static_cast(pos); - }, - [&](auto &ope, auto s, auto, auto &sv, auto &c, auto &, auto len, - auto &) { - auto pos = static_cast(s - c.s); - if (len != static_cast(-1)) { pos += len; } - std::string indent; - auto level = c.trace_ids.size() - 1; - while (level--) { - indent += "│"; - } - auto ret = len != static_cast(-1) ? "└o " : "└x "; - auto name = peg::TraceOpeName::get(const_cast(ope)); - std::stringstream choice; - if (sv.choice_count() > 0) { - choice << " " << sv.choice() << "/" << sv.choice_count(); - } - std::string token; - if (!sv.tokens.empty()) { - token += ", token '"; - token += sv.tokens[0]; - token += "'"; - } - std::string matched; - if (peg::success(len) && - peg::TokenChecker::is_token(const_cast(ope))) { - matched = ", match '" + peg::escape_characters(s, len) + "'"; - } - os << "L " << pos + 1 << "\t" << indent << ret << name << " #" - << c.trace_ids.back() << choice.str() << token << matched - << std::endl; - }, - [&](auto &trace_data) { trace_data = static_cast(0); }, - [&](auto &) {}); + auto lit = dynamic_cast(&ope); + if (lit) { name += " '" + peg::escape_characters(lit->lit_) + "'"; } + } + os << "E " << pos + 1 << backtrack << "\t" << indent << "┌" << name + << " #" << c.trace_ids.back() << std::endl; + trace_data = static_cast(pos); + }, + [&](auto &ope, auto s, auto, auto &sv, auto &c, auto &, auto len, + auto &) { + auto pos = static_cast(s - c.s); + if (len != static_cast(-1)) { pos += len; } + std::string indent; + auto level = c.trace_ids.size() - 1; + while (level--) { + indent += "│"; + } + auto ret = len != static_cast(-1) ? "└o " : "└x "; + auto name = peg::TraceOpeName::get(const_cast(ope)); + std::stringstream choice; + if (sv.choice_count() > 0) { + choice << " " << sv.choice() << "/" << sv.choice_count(); + } + std::string token; + if (!sv.tokens.empty()) { + token += ", token '"; + token += sv.tokens[0]; + token += "'"; + } + std::string matched; + if (peg::success(len) && + peg::TokenChecker::is_token(const_cast(ope))) { + matched = ", match '" + peg::escape_characters(s, len) + "'"; + } + os << "L " << pos + 1 << "\t" << indent << ret << name << " #" + << c.trace_ids.back() << choice.str() << token << matched + << std::endl; + }, + [&](auto &trace_data) { trace_data = static_cast(0); }, + [&](auto &) {}); } /*----------------------------------------------------------------------------- @@ -4840,98 +5661,98 @@ inline void enable_tracing(parser &parser, std::ostream &os) { *---------------------------------------------------------------------------*/ inline void enable_profiling(parser &parser, std::ostream &os) { - struct Stats { - struct Item { - std::string name; - size_t success; - size_t fail; - }; - std::vector items; - std::map index; - size_t total = 0; - std::chrono::steady_clock::time_point start; + struct Stats { + struct Item { + std::string name; + size_t success; + size_t fail; }; + std::vector items; + std::map index; + size_t total = 0; + std::chrono::steady_clock::time_point start; + }; - parser.enable_trace( - [&](auto &ope, auto, auto, auto &, auto &, auto &, std::any &trace_data) { - if (auto holder = dynamic_cast(&ope)) { - auto &stats = *std::any_cast(trace_data); + parser.enable_trace( + [&](auto &ope, auto, auto, auto &, auto &, auto &, std::any &trace_data) { + if (auto holder = dynamic_cast(&ope)) { + auto &stats = *std::any_cast(trace_data); - auto &name = holder->name(); - if (stats.index.find(name) == stats.index.end()) { - stats.index[name] = stats.index.size(); - stats.items.push_back({name, 0, 0}); - } - stats.total++; + auto &name = holder->name(); + if (stats.index.find(name) == stats.index.end()) { + stats.index[name] = stats.index.size(); + stats.items.push_back({name, 0, 0}); + } + stats.total++; + } + }, + [&](auto &ope, auto, auto, auto &, auto &, auto &, auto len, + std::any &trace_data) { + if (auto holder = dynamic_cast(&ope)) { + auto &stats = *std::any_cast(trace_data); + + auto &name = holder->name(); + auto index = stats.index[name]; + auto &stat = stats.items[index]; + if (len != static_cast(-1)) { + stat.success++; + } else { + stat.fail++; + } + + if (index == 0) { + auto end = std::chrono::steady_clock::now(); + auto nano = std::chrono::duration_cast( + end - stats.start) + .count(); + auto sec = nano / 1000000.0; + os << "duration: " << sec << "s (" << nano << "µs)" << std::endl + << std::endl; + + char buff[BUFSIZ]; + size_t total_success = 0; + size_t total_fail = 0; + for (auto &[name, success, fail] : stats.items) { + total_success += success; + total_fail += fail; } - }, - [&](auto &ope, auto, auto, auto &, auto &, auto &, auto len, - std::any &trace_data) { - if (auto holder = dynamic_cast(&ope)) { - auto &stats = *std::any_cast(trace_data); - auto &name = holder->name(); - auto index = stats.index[name]; - auto &stat = stats.items[index]; - if (len != static_cast(-1)) { - stat.success++; - } else { - stat.fail++; - } + os << " id total % success fail " + "definition" + << std::endl; - if (index == 0) { - auto end = std::chrono::steady_clock::now(); - auto nano = std::chrono::duration_cast( - end - stats.start) - .count(); - auto sec = nano / 1000000.0; - os << "duration: " << sec << "s (" << nano << "µs)" << std::endl - << std::endl; + auto grand_total = total_success + total_fail; + snprintf(buff, BUFSIZ, "%4s %10zu %5s %10zu %10zu %s", "", + grand_total, "", total_success, total_fail, + "Total counters"); + os << buff << std::endl; - char buff[BUFSIZ]; - size_t total_success = 0; - size_t total_fail = 0; - for (auto &[name_, success, fail] : stats.items) { - total_success += success; - total_fail += fail; - } + snprintf(buff, BUFSIZ, "%4s %10s %5s %10.2f %10.2f %s", "", "", + "", total_success * 100.0 / grand_total, + total_fail * 100.0 / grand_total, "% success/fail"); + os << buff << std::endl << std::endl; + ; - os << " id total % success fail " - "definition" - << std::endl; - - auto grand_total = total_success + total_fail; - snprintf(buff, BUFSIZ, "%4s %10zu %5s %10zu %10zu %s", "", - grand_total, "", total_success, total_fail, - "Total counters"); - os << buff << std::endl; - - snprintf(buff, BUFSIZ, "%4s %10s %5s %10.2f %10.2f %s", "", "", - "", total_success * 100.0 / grand_total, - total_fail * 100.0 / grand_total, "% success/fail"); - os << buff << std::endl << std::endl; - ; - - size_t id = 0; - for (auto &[name_, success, fail] : stats.items) { - auto total = success + fail; - auto ratio = total * 100.0 / stats.total; - snprintf(buff, BUFSIZ, "%4zu %10zu %5.2f %10zu %10zu %s", id, - total, ratio, success, fail, name.c_str()); - os << buff << std::endl; - id++; - } - } + size_t id = 0; + for (auto &[name, success, fail] : stats.items) { + auto total = success + fail; + auto ratio = total * 100.0 / stats.total; + snprintf(buff, BUFSIZ, "%4zu %10zu %5.2f %10zu %10zu %s", id, + total, ratio, success, fail, name.c_str()); + os << buff << std::endl; + id++; } - }, - [&](auto &trace_data) { - auto stats = new Stats{}; - stats->start = std::chrono::steady_clock::now(); - trace_data = stats; - }, - [&](auto &trace_data) { - auto stats = std::any_cast(trace_data); - delete stats; - }); + } + } + }, + [&](auto &trace_data) { + auto stats = new Stats{}; + stats->start = std::chrono::steady_clock::now(); + trace_data = stats; + }, + [&](auto &trace_data) { + auto stats = std::any_cast(trace_data); + delete stats; + }); } } // namespace peg