JSON parser, lexer, reassembler initial test

This commit is contained in:
scientiist
2024-08-20 13:32:02 -05:00
parent 76de1b72ef
commit 058730fcd4
7 changed files with 492 additions and 6 deletions

View File

@@ -1,6 +1,33 @@
# NOTE(review): this listing appears to interleave the previous and the new
# version of the build script (two cmake_minimum_required()/project() calls,
# two CMAKE_CXX_STANDARD values, and `jjx` declared both as an executable and
# as a library). Confirm against the real CMakeLists.txt before relying on it.
cmake_minimum_required(VERSION 3.28)
project(jjx)
# Version range form: minimum 3.18, policies set as of 3.28.
cmake_minimum_required(VERSION 3.18...3.28)
project(jjx
VERSION 1.0
LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
# Abort configuration when the build directory is the source directory.
if (PROJECT_SOURCE_DIR STREQUAL PROJECT_BINARY_DIR)
message(FATAL_ERROR "In-source builds are not allowed!")
endif()
add_executable(jjx main.cpp)
set(CMAKE_CXX_STANDARD 20)
# Make find-modules/scripts under ./cmake visible to include()/find_package().
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
#include(cmake/CPM.cmake)
# Collect every header under include/ and every source under src/.
file(GLOB_RECURSE HEADERS "include/*.hpp")
file(GLOB_RECURSE SOURCES "src/*.cpp")
include_directories("include")
# The library is built shared on UNIX platforms and static on Windows.
if (UNIX)
add_library(jjx SHARED ${SOURCES})
endif()
if (WIN32)
add_library(jjx STATIC ${SOURCES})
endif()
# Demo executable built from main.cpp and linked against the jjx library.
add_executable(jjx_demo main.cpp)
target_link_libraries(jjx_demo PUBLIC jjx)

35
include/jjx.hpp Normal file
View File

@@ -0,0 +1,35 @@
#pragma once
#include <iostream>
#include <vector>
#include <memory>
#include <optional>
#include <map>
// Public interface of the jjx library.
namespace jjx
{
// JSON lexing, parsing and re-serialisation.
namespace json {
// Category assigned to a token by the lexer. `syntax` covers the structural
// characters; `boolean` is used for both the `true` and `false` keywords.
enum class token_type { string, number, syntax, boolean, null };
// Discriminator telling which member of `value` is meaningful.
enum class value_type { string, number, object, array, boolean, null};
// A single lexed token.
struct token {
// Text of the token (for strings: the characters between the quotes).
std::string value;
// Category of the token.
token_type type;
// Index into the source text at which the token starts.
int location;
// Shared copy of the complete source text, used when formatting errors.
std::shared_ptr<std::string> full_source;
};
// A parsed JSON value. `type` selects which optional below holds the payload;
// a null value uses none of them.
struct value {
std::optional<std::string> string;
std::optional<double> number;
std::optional<bool> boolean;
std::optional<std::vector<value>> array;
std::optional<std::map<std::string, value>> object;
value_type type;
};
// Splits raw JSON text into tokens.
// Returns {tokens, error}; `error` is empty on success.
std::tuple<std::vector<json::token>, std::string> lex(std::string);
// Parses the value that starts at tokens[index].
// Returns {value, index of the first token after the value, error};
// `error` is empty on success.
std::tuple<json::value, int, std::string> parse(std::vector<json::token>, int index = 0);
// Lexes and parses raw JSON text. Returns {value, error}; `error` is empty
// on success.
std::tuple<json::value, std::string> parse(std::string);
// Serialises a value back to JSON text. `whitespace` is the indentation
// prefix placed in front of nested lines.
std::string deparse(json::value, std::string whitespace = "");
}
// Placeholder for the XML counterpart; currently empty.
namespace xml {}
}

View File

@@ -1,6 +1,22 @@
#include <jjx.hpp>
#include <iostream>
int main() {
std::cout << "Hello, World!" << std::endl;
using namespace jjx;
// Demo entry point: parses the JSON text given as the first command-line
// argument and prints it back re-serialised.
// Returns 0 on success and 1 when the argument is missing or parsing fails
// (the reason is written to stderr).
int main(int argc, char *argv[]) {
    // `argc` may legally be 0, in which case argv[1] must not be touched,
    // so test for "fewer than two" rather than "exactly one".
    if (argc < 2) {
        std::cerr << "Expected JSON input argument to parse" << std::endl;
        return 1;
    }

    const std::string in{argv[1]};
    auto [ast, error] = json::parse(in);
    if (!error.empty()) {
        std::cerr << error << std::endl;
        return 1;
    }

    std::cout << json::deparse(ast);
    return 0;
}

26
samples/widgets.json Normal file
View File

@@ -0,0 +1,26 @@
{"widget": {
"debug": "on",
"window": {
"title": "Sample Konfabulator Widget",
"name": "main_window",
"width": 500,
"height": 500
},
"image": {
"src": "Images/Sun.png",
"name": "sun1",
"hOffset": 250,
"vOffset": 250,
"alignment": "center"
},
"text": {
"data": "Click Here",
"size": 36,
"style": "bold",
"name": "text1",
"hOffset": 250,
"vOffset": 100,
"alignment": "center",
"onMouseUp": "sun1.opacity = (sun1.opacity / 100) * 90;"
}
}}

3
src/jjx.cpp Normal file
View File

@@ -0,0 +1,3 @@
//
// Created by josh on 8/19/24.
//

378
src/json.cpp Normal file
View File

@@ -0,0 +1,378 @@
#include <jjx.hpp>

#include <cctype>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <map>
#include <memory>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>
namespace jjx::json {
// Builds a three-line diagnostic for position `index` of `source`:
//   "<base> at line L, column C"
//   the full text of the line containing that position
//   a caret placed under the offending character
// Lines are counted from 1; the column is the number of characters that
// precede `index` on its line.
std::string format_error(std::string base, std::string source, int index) {
    int line = 1;
    int column = 0;
    std::string current_line;
    std::string padding;

    // Scan everything in front of `index`, remembering the text of the line
    // we are on and one padding space per character of that line.
    int pos = 0;
    for (const char ch : source) {
        if (pos == index) {
            break;
        }
        ++pos;
        if (ch == '\n') {
            ++line;
            column = 0;
            current_line.clear();
            padding.clear();
            continue;
        }
        ++column;
        // Tabs are shown as a single space so the caret stays aligned.
        current_line += (ch == '\t') ? ' ' : ch;
        padding += ' ';
    }

    // Append the remainder of the current line (up to, not including, the
    // next newline) so the whole line is shown.
    const auto line_end = source.find('\n', pos);
    if (line_end == std::string::npos) {
        current_line.append(source, pos, std::string::npos);
    } else {
        current_line.append(source, pos, line_end - pos);
    }

    // TODO: Migrate the below code bits to std::format
    std::ostringstream out;
    out << base << " at line " << line << ", column " << column << '\n'
        << current_line << '\n'
        << padding << "^";
    return out.str();
}
// Skips whitespace starting at `index` and returns the index of the first
// non-whitespace character (or the string length when only whitespace is
// left). An out-of-range `index` is returned unchanged.
int lex_whitespace(std::string raw_json, int index) {
    const int length = static_cast<int>(raw_json.length());
    // The bounds test comes first so nothing is read past the end, and the
    // character is widened through unsigned char because std::isspace has
    // undefined behaviour for negative values (e.g. UTF-8 continuation bytes).
    while (index >= 0 && index < length &&
           std::isspace(static_cast<unsigned char>(raw_json[index]))) {
        index++;
    }
    return index;
}
// Lexes one structural character ([ ] { } : ,) at `index`.
// Returns {token, next index, error}. When the character at `index` is not
// structural the returned index equals `index`, which callers treat as
// "no match". The error string is always empty.
std::tuple<json::token, int, std::string> lex_syntax(std::string raw_json, int index)
{
    json::token token{"", token_type::syntax, index};
    // Nothing to match outside the input.
    if (index < 0 || index >= static_cast<int>(raw_json.length())) {
        return {token, index, ""};
    }

    switch (raw_json[index]) {
        case '[':
        case ']':
        case '{':
        case '}':
        case ':':
        case ',':
            token.value += raw_json[index];
            index++;
            break;
        default:
            break;
    }
    return {token, index, ""};
}
// Lexes a double-quoted string starting at `original_index`.
// Returns {token, next index, error}:
//   - no opening quote at `original_index`: index unchanged ("no match");
//   - closing quote found: index just past it, token.value holds the text
//     between the quotes;
//   - input ends first: a formatted "Unexpected EOF" error.
// Escape sequences are kept verbatim (backslash included) in token.value, so
// an escaped quote (\") no longer terminates the string and the text can be
// written back out between quotes unchanged.
std::tuple<json::token, int, std::string> lex_string(std::string raw_json, int original_index) {
    json::token token {"", token_type::string, original_index};
    const int length = static_cast<int>(raw_json.length());
    if (original_index < 0 || original_index >= length || raw_json[original_index] != '"') {
        return {token, original_index, ""};
    }

    int index = original_index + 1;
    while (index < length) {
        const char c = raw_json[index];
        if (c == '"') {
            return {token, index + 1, ""};
        }
        token.value += c;
        index++;
        // A backslash escapes the following character: copy it as-is so it
        // is never mistaken for the closing quote.
        if (c == '\\' && index < length) {
            token.value += raw_json[index];
            index++;
        }
    }
    return {token, index, format_error("Unexpected EOF while lexing string", raw_json, index)};
}
// Lexes a JSON number starting at `original_index`:
//   optional '-', one or more digits, optional ".digits" fraction,
//   optional exponent ('e' or 'E', optional sign, digits).
// Returns {token, next index, error}. When no digits are found the index is
// returned unchanged ("no match"). A fraction or exponent is only consumed
// when it is complete, so "1." lexes as "1" and leaves the '.' behind.
// The error string is always empty.
std::tuple<json::token, int, std::string> lex_number(std::string raw_json, int original_index) {
    json::token token {"", token_type::number, original_index};
    const int length = static_cast<int>(raw_json.length());
    if (original_index < 0 || original_index >= length) {
        return {token, original_index, ""};
    }

    const auto is_digit_at = [&](int i) {
        return i < length && raw_json[i] >= '0' && raw_json[i] <= '9';
    };
    const auto skip_digits = [&](int i) {
        while (is_digit_at(i)) {
            i++;
        }
        return i;
    };

    int index = original_index;
    if (raw_json[index] == '-') {
        index++;
    }
    // A sign on its own is not a number.
    if (!is_digit_at(index)) {
        return {token, original_index, ""};
    }
    index = skip_digits(index);

    // Fraction: '.' followed by at least one digit.
    if (index < length && raw_json[index] == '.' && is_digit_at(index + 1)) {
        index = skip_digits(index + 1);
    }

    // Exponent: e/E, optional sign, at least one digit.
    if (index < length && (raw_json[index] == 'e' || raw_json[index] == 'E')) {
        int exponent = index + 1;
        if (exponent < length && (raw_json[exponent] == '+' || raw_json[exponent] == '-')) {
            exponent++;
        }
        if (is_digit_at(exponent)) {
            index = skip_digits(exponent);
        }
    }

    token.value = raw_json.substr(original_index, index - original_index);
    return {token, index, ""};
}
// Lexes the literal `keyword` at `original_index`, producing a token of the
// given `type`.
// Returns {token, next index, error}. The index only advances when the whole
// keyword is present; a partial match (e.g. "tru" for "true") leaves it at
// `original_index` so callers see "no match" instead of an empty token.
// The error string is always empty.
std::tuple<json::token, int, std::string> lex_keyword(std::string raw_json, std::string keyword, json::token_type type, int original_index) {
    json::token token{"", type, original_index};

    const bool in_range = original_index >= 0 &&
                          static_cast<std::size_t>(original_index) <= raw_json.length();
    // compare() clamps the compared span to the end of raw_json, so a
    // truncated keyword compares unequal rather than reading out of bounds.
    if (keyword.empty() || !in_range ||
        raw_json.compare(original_index, keyword.length(), keyword) != 0) {
        return {token, original_index, ""};
    }

    token.value = keyword;
    return {token, original_index + static_cast<int>(keyword.length()), ""};
}
// Lexes the `null` literal at `index`; see lex_keyword for the result shape.
std::tuple<json::token, int, std::string> lex_null(std::string raw_json, int index)
{
    const std::string literal = "null";
    return lex_keyword(raw_json, literal, token_type::null, index);
}
// Lexes the `true` literal at `index` as a boolean token; see lex_keyword
// for the result shape.
std::tuple<json::token, int, std::string> lex_true(std::string raw_json, int index)
{
    const std::string literal = "true";
    return lex_keyword(raw_json, literal, token_type::boolean, index);
}
// Lexes the `false` literal at `index` as a boolean token; see lex_keyword
// for the result shape.
std::tuple<json::token, int, std::string> lex_false(std::string raw_json, int index) {
    const std::string literal = "false";
    return lex_keyword(raw_json, literal, token_type::boolean, index);
}
std::tuple<std::vector<json::token>, std::string> lex(std::string raw_json) {
std::vector<json::token> tokens;
// All tokens will embed a pointer to the raw JSON for debugging purposes
auto original_copy = std::make_shared<std::string>(raw_json);
auto generic_lexers = {lex_syntax, lex_string, lex_number, lex_null, lex_true, lex_false};
for (int i = 0; i < raw_json.length(); i++) {
// Skip past whitespace
if (auto new_index = lex_whitespace(raw_json, i); i != new_index) {
i = new_index - 1;
continue;
}
auto found = false;
for (auto lexer: generic_lexers) {
if (auto [token, new_index, error] = lexer(raw_json, i); i != new_index) {
// Error while lexing, return early
if (error.length()) {
return {{}, error};
}
// Store reference to the original source
token.full_source = original_copy;
tokens.push_back(token);
i = new_index - 1;
found = true;
break;
}
}
if (found) {
continue;
}
return {{}, format_error("Unable to lex", raw_json, i)};
}
return {tokens, ""};
}
// C++ does not provide a built-in way to turn an enum value into its name,
// which is inconvenient for debugging, so the mapping is spelled out by hand.
// Reflection-based approaches exist but seem hairy.
//
// Returns the display name of `tok`, or "Unknown" for a value that is not
// one of the declared enumerators.
std::string token_type_tostring(token_type tok)
{
    switch(tok) {
        case token_type::string: return "String";
        case token_type::number: return "Number";
        case token_type::syntax: return "Syntax";
        case token_type::boolean: return "Boolean";
        case token_type::null: return "Null";
    }
    // Flowing off the end of a non-void function is undefined behaviour, so
    // out-of-range values get an explicit answer.
    return "Unknown";
}
// Formats a parser diagnostic for `token`: a header naming the token's text,
// type and source index, then `base` on its own line, followed by the source
// excerpt produced by format_error. When the token carries no source text
// only the header and `base` are returned.
std::string format_parse_error(std::string base, json::token token)
{
    std::ostringstream s;
    s << "Unexpected token '" << token.value << "', type '"
      << token_type_tostring(token.type) << "', index " << token.location;
    s << '\n' << base;
    // Tokens built outside lex() may not have a source attached.
    if (!token.full_source) {
        return s.str();
    }
    return format_error(s.str(), *token.full_source, token.location);
}
// Parses the elements of an array. `index` must point at the first token
// after the opening '['.
// Returns {elements, index just past the closing ']', error}; `error` is
// empty on success. Elements must be separated by exactly one comma, and
// running out of tokens before ']' is reported as an error.
std::tuple<std::vector<json::value>, int, std::string> parse_array(std::vector<json::token> tokens, int index) {
    std::vector<json::value> children = {};
    if (tokens.empty()) {
        return {{}, index, std::string("Unexpected EOF while parsing array")};
    }

    const int count = static_cast<int>(tokens.size());
    while (index < count) {
        const auto &t = tokens[index];
        const bool is_syntax = t.type == token_type::syntax;
        if (is_syntax && t.value == "]") {
            return {children, index + 1, ""};
        }

        // Every element after the first must be introduced by a comma,
        // whatever kind of token follows.
        if (!children.empty()) {
            if (!(is_syntax && t.value == ",")) {
                return {{}, index,
                        format_parse_error("Expected comma after element in array", t)};
            }
            index++;
            if (index >= count) {
                break;  // input ended right after the comma
            }
        }

        auto [child, new_index, error] = parse(tokens, index);
        if (error.size()) { return {{}, index, error}; }
        children.push_back(child);
        index = new_index;
    }

    // Ran out of tokens: report against the last token that exists rather
    // than indexing past the end.
    return {
        {}, index,
        format_parse_error("Unexpected EOF while parsing array", tokens.back())};
}
std::tuple<std::map<std::string, value>, int, std::string> parse_object(std::vector<token> tokens, int index) {
std::map<std::string, value> values = {};
while (index < tokens.size()) {
auto t = tokens[index];
if (t.type == token_type::syntax) {
if (t.value == "}") {
return {values, index + 1, ""};
}
if (t.value == ",") {
index++;
t = tokens[index];
} else if (values.size() > 0) {
return {{}, index,
format_parse_error("Expected comma after element in object", t)
};
} else {
return {{}, index,
format_parse_error("Expected key-value pair or closing brace in object", t)
};
}
}
auto [key, new_index, error] = parse(tokens, index);
if (error.size())
{
return {{}, index, error};
}
if (key.type != value_type::string) {
return {{}, index,
format_parse_error("Expected string key in object", t)};
}
index = new_index;
t = tokens[index];
if (!(t.type == token_type::syntax && t.value == ":")) {
return {{}, index,
format_parse_error("Expected colon after key in object", t)};
}
index++;
t = tokens[index];
auto [value, new_index1, error1] = parse(tokens, index);
if (error1.size()) {
return {{}, index, error1};
}
values[key.string.value()] = value;
index = new_index1;
}
return {values, index+1, ""};
}
// Parses the JSON value that starts at tokens[index].
// Returns {value, index of the first token after the value, error}; `error`
// is empty on success. On failure the returned value is a null placeholder
// (arrays/objects carry whatever was collected) and should be ignored.
std::tuple<json::value, int, std::string> parse(std::vector<json::token> tokens, int index) {
    // Placeholder returned alongside errors; gives `type` a defined value.
    const json::value none{.type = value_type::null};

    if (tokens.empty()) {
        return {none, index, "Unexpected end of input: expected a JSON value"};
    }
    if (index < 0 || index >= static_cast<int>(tokens.size())) {
        return {none, index,
                format_parse_error("Unexpected EOF while parsing value", tokens.back())};
    }

    const auto token = tokens[index];
    switch(token.type) {
        case token_type::number: {
            // std::stod throws on text it cannot convert or that is out of
            // range for double; turn that into a regular parse error.
            try {
                const auto n = std::stod(token.value);
                return {json::value{.number = n, .type = value_type::number}, index+1, ""};
            } catch (const std::exception &) {
                return {none, index, format_parse_error("Invalid number", token)};
            }
        }
        case token_type::boolean:
            return {json::value{.boolean = token.value == "true", .type = value_type::boolean}, index + 1, ""};
        case token_type::null:
            return {json::value{.type = value_type::null}, index+1, ""};
        case token_type::string:
            return {json::value{.string = token.value, .type = value_type::string}, index+1, ""};
        case token_type::syntax: {
            if (token.value == "[") {
                auto [array, new_index, error] = parse_array(tokens, index + 1);
                return {json::value{.array = array, .type = value_type::array}, new_index, error};
            }
            if (token.value == "{") {
                auto [object, new_index, error] = parse_object(tokens, index + 1);
                return {json::value{.object = std::optional(object), .type = value_type::object}, new_index, error};
            }
            // Any other structural character cannot start a value.
            break;
        }
    }
    return {none, index, format_parse_error("Failed to parse", token)};
}
std::tuple<json::value, std::string> parse(std::string source) {
auto [tokens, error] = json::lex(source);
if (error.size())
{
return {{}, error};
}
auto [ast, _, error1] = json::parse(tokens);
return {ast, error1};
}
// Formats a double as the shortest of the 15-, 16- or 17-significant-digit
// "%g" renderings that converts back to exactly the same value, so whole
// numbers print without a fraction (500 -> "500") and no precision is lost.
static std::string number_to_json(double n) {
    char buffer[40];
    for (int precision = 15; precision < 17; ++precision) {
        std::snprintf(buffer, sizeof buffer, "%.*g", precision, n);
        if (std::strtod(buffer, nullptr) == n) {
            return buffer;
        }
    }
    // 17 significant digits are always enough to round-trip a double.
    std::snprintf(buffer, sizeof buffer, "%.17g", n);
    return buffer;
}

// Serialises `v` back to JSON text.
// `whitespace` is the indentation prefix of the line the value sits on;
// nested array elements and object members are written one per line with one
// extra space of indentation. String contents and object keys are written
// between quotes exactly as stored (no escaping is applied here).
// A value whose `type` is not a known enumerator is written as "null".
// Throws std::bad_optional_access if the member selected by `type` is unset.
std::string deparse(json::value v, std::string whitespace) {
    const std::string inner = whitespace + " ";
    switch(v.type) {
        case json::value_type::string:
            return "\"" + v.string.value() + "\"";
        case json::value_type::boolean:
            return (v.boolean.value() ? "true" : "false");
        case json::value_type::number:
            return number_to_json(v.number.value());
        case json::value_type::null:
            return "null";
        case json::value_type::array: {
            std::string s = "[\n";
            const auto &items = v.array.value();
            for (std::size_t i = 0; i < items.size(); i++) {
                s += inner + deparse(items[i], inner);
                // Comma after every element except the last.
                if (i + 1 < items.size()) {
                    s += ",";
                }
                s += "\n";
            }
            return s + whitespace + "]";
        }
        case json::value_type::object: {
            std::string s = "{\n";
            const auto &members = v.object.value();
            std::size_t i = 0;
            for (auto const &[key, member] : members) {
                s += inner + "\"" + key + "\":" + deparse(member, inner);
                // Comma after every member except the last.
                if (i + 1 < members.size()) {
                    s += ",";
                }
                s += "\n";
                i++;
            }
            return s + whitespace + "}";
        }
    }
    // Reached only for an out-of-range `type`; avoids flowing off the end of
    // a non-void function.
    return "null";
}
}

1
src/xml.cpp Normal file
View File

@@ -0,0 +1 @@
#include "../include/jjx.hpp"