lib/nbt: Implement parser

resources/NBT_data: Add two files that are valid NBT by our extended spec
Since we allow loose tags to be valid NBT data, this is valid NBT by our spec. This isn’t valid by Mojang’s spec.
2022-10-15 23:05:26 +02:00 · 2022-10-15 21:41:32 +02:00 · 2022-10-15 18:55:58 +02:00
4 changed files with 401 additions and 14 deletions
--- a/resources/NBT_data/bare_int64_tag
+++ b/resources/NBT_data/bare_int64_tag
--- a/resources/NBT_data/bare_int64_tag_and_int32_tag
+++ b/resources/NBT_data/bare_int64_tag_and_int32_tag
--- a/src/lib/nbt.cpp
+++ b/src/lib/nbt.cpp
@ -1168,6 +1168,371 @@ namespace NBT {

    }

+    // Parses the payload of a list tag: one content type byte, a 32-bit
+    // element count and then the headerless (nameless) elements.
+    //
+    // data, dataSize:     buffer containing the raw NBT data and its length
+    // initialPosition:    index of the list's content type byte
+    // processedDataSize:  out parameter, must not be null (it is dereferenced
+    //                     unconditionally); receives the number of bytes
+    //                     consumed, only useful to the caller on success
+    //
+    // Returns the parsed elements (allocated with new, so the caller takes
+    // ownership) or an error. Tags that were already created when an error
+    // occurs are freed again before returning.
+    //
+    // the same comment about blindly passing up error codes applies to this function
+    ErrorOr<std::vector<Tag::Generic*>> deserializeRawListContents(uint8_t data[], uint64_t dataSize, uint64_t initialPosition, uint64_t* processedDataSize) {
+        std::vector<Tag::Generic*> contents;
+        ErrorOr<std::vector<Tag::Generic*>> returnValue;
+        // get contained data length by reading it manually because
+        // the function that does it normally can't deal with
+        // headerless tags
+        //
+        // add one byte to position to skip the type byte
+        ErrorOr<int32_t> elementCount = Helper::readInt32(data, dataSize, initialPosition+1);
+        if (elementCount.isError) {
+            // nothing has been allocated yet, so a plain return is fine
+            return ErrorOr<std::vector<Tag::Generic*>>(true, elementCount.errorCode);
+        }
+
+        uint8_t contentType = data[initialPosition];
+        // contained type byte + 4 length bytes = 5
+        *processedDataSize = 5;
+        switch (contentType) {
+            case TagType::END: {
+                // everything except content has been touched at this point
+                // and a list of end tags has no content that could be read
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    contents.push_back(new Tag::End());
+                }
+                break;
+            }
+            case TagType::INT8: {
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    ErrorOr<int8_t> nextInt = Helper::readInt8(data, dataSize, initialPosition+*processedDataSize);
+                    if (nextInt.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextInt.errorCode);
+                        goto returnNow;
+                    }
+                    contents.push_back(new Tag::Int8("", nextInt.value));
+                    // The below code would produce a warning on GCC and Clang
+                    // about the computed value not being used. While this does
+                    // apply inside this function, it is ultimately not true
+                    // as the pointer is used both inside and outside of the
+                    // function.
+                    *processedDataSize += 1;
+                }
+                break;
+            }
+            case TagType::INT16: {
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    ErrorOr<int16_t> nextInt = Helper::readInt16(data, dataSize, initialPosition+*processedDataSize);
+                    if (nextInt.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextInt.errorCode);
+                        goto returnNow;
+                    }
+                    contents.push_back(new Tag::Int16("", nextInt.value));
+                    *processedDataSize += 2;
+                }
+                break;
+            }
+            case TagType::INT32: {
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    ErrorOr<int32_t> nextInt = Helper::readInt32(data, dataSize, initialPosition+*processedDataSize);
+                    if (nextInt.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextInt.errorCode);
+                        goto returnNow;
+                    }
+                    contents.push_back(new Tag::Int32("", nextInt.value));
+                    *processedDataSize += 4;
+                }
+                break;
+            }
+            case TagType::FLOAT: {
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    ErrorOr<float> nextFloat = Helper::readFloat(data, dataSize, initialPosition+*processedDataSize);
+                    if (nextFloat.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextFloat.errorCode);
+                        goto returnNow;
+                    }
+                    contents.push_back(new Tag::Float("", nextFloat.value));
+                    *processedDataSize += 4;
+                }
+                break;
+            }
+            case TagType::INT64: {
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    ErrorOr<int64_t> nextInt = Helper::readInt64(data, dataSize, initialPosition+*processedDataSize);
+                    if (nextInt.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextInt.errorCode);
+                        goto returnNow;
+                    }
+                    contents.push_back(new Tag::Int64("", nextInt.value));
+                    *processedDataSize += 8;
+                }
+                break;
+            }
+            case TagType::DOUBLE: {
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    ErrorOr<double> nextDouble = Helper::readDouble(data, dataSize, initialPosition+*processedDataSize);
+                    if (nextDouble.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextDouble.errorCode);
+                        goto returnNow;
+                    }
+                    contents.push_back(new Tag::Double("", nextDouble.value));
+                    *processedDataSize += 8;
+                }
+                break;
+            }
+            case TagType::INT8_ARRAY: {
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    ErrorOr<std::vector<int8_t>> nextArray = Helper::readInt8Array(data, dataSize, initialPosition+*processedDataSize);
+                    if (nextArray.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextArray.errorCode);
+                        goto returnNow;
+                    }
+                    contents.push_back(new Tag::Int8Array("", nextArray.value));
+                    // 4 length prefix bytes + one byte per element
+                    *processedDataSize += (uint64_t) nextArray.value.size() + 4;
+                }
+                break;
+            }
+            case TagType::STRING: {
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    ErrorOr<tiny_utf8::string> nextString = Helper::readString(data, dataSize, initialPosition+*processedDataSize);
+                    if (nextString.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextString.errorCode);
+                        goto returnNow;
+                    }
+                    contents.push_back(new Tag::String("", nextString.value));
+                    // this cannot be an error because it just got read
+                    //
+                    // The length prefix is an unsigned 16 bit value, going
+                    // through uint16_t prevents sign extension for strings
+                    // longer than 32767 bytes.
+                    uint16_t nextStringSize = (uint16_t) Helper::readInt16(data, dataSize, initialPosition+*processedDataSize).value;
+                    // 2 length prefix bytes + the string's bytes
+                    *processedDataSize += (uint64_t) nextStringSize + 2;
+                }
+                break;
+            }
+            case TagType::LIST: {
+                uint64_t containedDataSize = 0;
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    containedDataSize = 0;
+
+                    ErrorOr<std::vector<Tag::Generic*>> nextListContents = deserializeRawListContents(data, dataSize, initialPosition+*processedDataSize, &containedDataSize);
+                    if (nextListContents.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextListContents.errorCode);
+                        goto returnNow;
+                    }
+                    contents.push_back(new Tag::List("", nextListContents.value));
+                    *processedDataSize += containedDataSize;
+                }
+                break;
+            }
+            case TagType::COMPOUND: {
+                uint64_t containedDataSize = 0;
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    containedDataSize = 0;
+                    ErrorOr<std::vector<Tag::Generic*>> nextCompoundData = deserialize(data, dataSize, initialPosition+*processedDataSize, &containedDataSize);
+                    if (nextCompoundData.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextCompoundData.errorCode);
+                        goto returnNow;
+                    }
+                    contents.push_back(new Tag::Compound("", nextCompoundData.value));
+                    *processedDataSize += containedDataSize;
+                }
+                break;
+            }
+            case TagType::INT32_ARRAY: {
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    ErrorOr<std::vector<int32_t>> nextArray = Helper::readInt32Array(data, dataSize, initialPosition+*processedDataSize);
+                    if (nextArray.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextArray.errorCode);
+                        goto returnNow;
+                    }
+                    contents.push_back(new Tag::Int32Array("", nextArray.value));
+                    // 4 length prefix bytes + four bytes per element
+                    *processedDataSize += (uint64_t) nextArray.value.size() * 4 + 4;
+                }
+                break;
+            }
+            case TagType::INT64_ARRAY: {
+                for (int32_t i=0; i<elementCount.value; i++) {
+                    ErrorOr<std::vector<int64_t>> nextArray = Helper::readInt64Array(data, dataSize, initialPosition+*processedDataSize);
+                    if (nextArray.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextArray.errorCode);
+                        goto returnNow;
+                    }
+                    contents.push_back(new Tag::Int64Array("", nextArray.value));
+                    // 4 length prefix bytes + eight bytes per element
+                    *processedDataSize += (uint64_t) nextArray.value.size() * 8 + 4;
+                }
+                break;
+            }
+            default: {
+                returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, ErrorCodes::INVALID_TYPE);
+                goto returnNow;
+            }
+        }
+        returnValue = ErrorOr<std::vector<Tag::Generic*>>(contents);
+
+        returnNow:
+            // Nobody else holds the tags created so far when bailing out
+            // with an error, so they have to be freed here.
+            if (returnValue.isError) {
+                for (uint64_t i=0; i<contents.size(); i++) {
+                    delete contents[i];
+                }
+            }
+            return returnValue;
+    }
+
+    // Parses raw NBT data into a vector of tags.
+    //
+    // data, dataSize:     buffer containing the raw NBT data and its length
+    // initialPosition:    index of the first byte to parse
+    // processedDataSize:  optional out parameter (may be nullptr); receives
+    //                     the number of bytes consumed, also when an error
+    //                     is returned
+    //
+    // Parsing stops at the end of the data or after consuming an end tag,
+    // whichever comes first. On error, all tags created so far are freed.
+    //
+    // comment about blindly passing up error codes applies here
+    //
+    // The return value of this function is a vector of tags
+    // instead of a compound tag due to a spec extension that allows
+    // for any bare tag to be valid NBT data without a containing
+    // compound tag. This also just makes the implementation easier.
+    ErrorOr<std::vector<Tag::Generic*>> deserialize(uint8_t data[], uint64_t dataSize, uint64_t initialPosition, uint64_t* processedDataSize){
+        if (initialPosition >= dataSize) {
+            // An interesting question at this point is whether we should
+            // consider empty input valid or invalid NBT data.
+            //
+            // The original spec says that the top-most tag is always a
+            // compound (or in more recent times, the Microsoft-commercialized
+            // in-game-purchase-enabling version also allows list tags)
+            // which automatically means that no data is invalid data...
+            // I don't see a reason why having a different tag as the top-most
+            // tag shouldn't be valid NBT in which case we have to face the
+            // question whether no data is invalid or just empty NBT data.
+            //
+            // This seems like a reasonable extension to the spec to me and
+            // it should be backwards compatible AFAIK.
+            //
+            // - BodgeMaster
+            if (processedDataSize!=nullptr) *processedDataSize=0;
+            return ErrorOr<std::vector<Tag::Generic*>>(true, ErrorCodes::OUT_OF_RANGE);
+        }
+
+        std::vector<Tag::Generic*> tags;
+        ErrorOr<std::vector<Tag::Generic*>> returnValue;
+        uint64_t currentPosition = initialPosition;
+        while (currentPosition<dataSize) {
+            ErrorOr<uint64_t> nextTagSize = Helper::totalTagSize(data, dataSize, currentPosition);
+            if (nextTagSize.isError) {
+                if (nextTagSize.errorCode == ErrorCodes::NOT_YET_KNOWN) {
+                    ErrorOr<tiny_utf8::string> tagName = Helper::readString(data, dataSize, currentPosition+1);
+                    if (tagName.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, tagName.errorCode);
+                        goto returnNow;
+                    }
+
+                    // used to seek to the start of the list's/compound's contents
+                    //
+                    // there is no way this is an error bc it gets
+                    // checked while trying to parse the string above
+                    //
+                    // The name length is an unsigned 16 bit value, going
+                    // through uint16_t prevents sign extension for names
+                    // longer than 32767 bytes.
+                    uint16_t nameSize = (uint16_t) Helper::readInt16(data, dataSize, currentPosition+1).value;
+                    // type byte + two name size bytes = 3
+                    uint64_t headerSize = (uint64_t) nameSize + 3;
+
+                    uint64_t processedTagSize = 0;
+
+                    if (data[currentPosition]==TagType::LIST) {
+                        ErrorOr<std::vector<Tag::Generic*>> listData = deserializeRawListContents(data, dataSize, currentPosition + headerSize, &processedTagSize);
+                        if (listData.isError) {
+                            returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, listData.errorCode);
+                            goto returnNow;
+                        }
+                        tags.push_back(new Tag::List(tagName.value, listData.value));
+                    } else if (data[currentPosition]==TagType::COMPOUND) {
+                        ErrorOr<std::vector<Tag::Generic*>> compoundData = deserialize(data, dataSize, currentPosition + headerSize, &processedTagSize);
+                        if (compoundData.isError) {
+                            returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, compoundData.errorCode);
+                            goto returnNow;
+                        }
+                        tags.push_back(new Tag::Compound(tagName.value, compoundData.value));
+                    } else {
+                        // Only lists and compounds are handled here. Bail out
+                        // instead of looping forever without making progress
+                        // if the size of any other tag type is reported as
+                        // not yet known.
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, ErrorCodes::UNKNOWN);
+                        goto returnNow;
+                    }
+                    currentPosition += processedTagSize + headerSize;
+                    continue;
+                }
+                returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, nextTagSize.errorCode);
+                goto returnNow;
+            }
+
+            if (currentPosition + nextTagSize.value > dataSize) {
+                returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, ErrorCodes::OVERRUN);
+                goto returnNow;
+            }
+
+            // recursion abort condition
+            if (data[currentPosition]==TagType::END) {
+                // not appending an end tag as it is built into
+                // the compound anyway
+                currentPosition++;
+                returnValue = ErrorOr<std::vector<Tag::Generic*>>(tags);
+                goto returnNow;
+            }
+
+            // nameSize cannot be an error here bc it got checked in
+            // nextTagSize() already
+            //
+            // unsigned 16 bit value, see above
+            uint16_t nameSize = (uint16_t) Helper::readInt16(data, dataSize, currentPosition+1).value;
+
+            ErrorOr<tiny_utf8::string> name = Helper::readString(data, dataSize, currentPosition+1);
+            if (name.isError) {
+                returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, name.errorCode);
+                goto returnNow;
+            }
+
+            // type byte + two name size bytes = 3
+            uint64_t contentPosition = currentPosition + (uint64_t) nameSize + 3;
+
+            // Overrun / out of range errors have already been ruled out by
+            // checking the tag size against the total amount of data.
+            switch (data[currentPosition]) {
+                case TagType::INT8: {
+                    int8_t content = Helper::readInt8(data, dataSize, contentPosition).value;
+                    tags.push_back(new Tag::Int8(name.value, content));
+                    break;
+                }
+                case TagType::INT16: {
+                    int16_t content = Helper::readInt16(data, dataSize, contentPosition).value;
+                    tags.push_back(new Tag::Int16(name.value, content));
+                    break;
+                }
+                case TagType::INT32: {
+                    int32_t content = Helper::readInt32(data, dataSize, contentPosition).value;
+                    tags.push_back(new Tag::Int32(name.value, content));
+                    break;
+                }
+                case TagType::INT64: {
+                    int64_t content = Helper::readInt64(data, dataSize, contentPosition).value;
+                    tags.push_back(new Tag::Int64(name.value, content));
+                    break;
+                }
+                case TagType::FLOAT: {
+                    float content = Helper::readFloat(data, dataSize, contentPosition).value;
+                    tags.push_back(new Tag::Float(name.value, content));
+                    break;
+                }
+                case TagType::DOUBLE: {
+                    double content = Helper::readDouble(data, dataSize, contentPosition).value;
+                    tags.push_back(new Tag::Double(name.value, content));
+                    break;
+                }
+                case TagType::INT8_ARRAY: {
+                    std::vector<int8_t> content = Helper::readInt8Array(data, dataSize, contentPosition).value;
+                    tags.push_back(new Tag::Int8Array(name.value, content));
+                    break;
+                }
+                case TagType::STRING: {
+                    ErrorOr<tiny_utf8::string> content = Helper::readString(data, dataSize, contentPosition);
+                    if (content.isError) {
+                        returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, content.errorCode);
+                        goto returnNow;
+                    }
+                    tags.push_back(new Tag::String(name.value, content.value));
+                    break;
+                }
+                case TagType::INT32_ARRAY: {
+                    std::vector<int32_t> content = Helper::readInt32Array(data, dataSize, contentPosition).value;
+                    tags.push_back(new Tag::Int32Array(name.value, content));
+                    break;
+                }
+                case TagType::INT64_ARRAY: {
+                    std::vector<int64_t> content = Helper::readInt64Array(data, dataSize, contentPosition).value;
+                    tags.push_back(new Tag::Int64Array(name.value, content));
+                    break;
+                }
+                default: {
+                    returnValue = ErrorOr<std::vector<Tag::Generic*>>(true, ErrorCodes::UNKNOWN);
+                    goto returnNow;
+                }
+            }
+
+            currentPosition += nextTagSize.value;
+        }
+        // ran out of data without hitting an end tag, this is
+        // valid by the extended spec (bare tags)
+        returnValue = ErrorOr<std::vector<Tag::Generic*>>(tags);
+
+        returnNow:
+            if (processedDataSize!=nullptr) {
+                *processedDataSize = currentPosition-initialPosition;
+            }
+            // the tags are not handed to the caller on error,
+            // so they have to be freed here
+            if (returnValue.isError) {
+                for (uint64_t i=0; i<tags.size(); i++) {
+                    delete tags[i];
+                }
+            }
+            return returnValue;
+    }
+
    bool validateRawListContents(uint8_t data[], uint64_t dataSize, uint64_t initialPosition, uint64_t* processedDataSize) {
        // get contained data length by reading it manually because
        // the function that does it normally can't deal with
@ -1307,8 +1672,8 @@ namespace NBT {
            // - BodgeMaster
        }

+        bool returnValue;
        uint64_t currentPosition = initialPosition;
-        #define return if (processedDataSize!=nullptr) *processedDataSize = currentPosition-initialPosition; return
        while (currentPosition<dataSize) {
            ErrorOr<uint64_t> nextTagSize = Helper::totalTagSize(data, dataSize, currentPosition);
            if (nextTagSize.isError) {
@ -1316,7 +1681,8 @@ namespace NBT {
                    // attempt parsing the name
                    ErrorOr<tiny_utf8::string> tagName = Helper::readString(data, dataSize, currentPosition+1);
                    if (tagName.isError) {
-                        return false;
+                        returnValue = false;
+                        goto returnNow;
                    }

                    // used seek to the start of the list's/compound’s contents
@ -1332,7 +1698,8 @@ namespace NBT {
                        // type byte + two name size bytes = 3
                        if (!validateRawListContents(data, dataSize, currentPosition + (uint64_t) nameSize + 3, processedTagSize)) {
                            delete processedTagSize;
-                            return false;
+                            returnValue = false;
+                            goto returnNow;
                        }
                        *processedTagSize += (uint64_t) nameSize + 3;
                    }
@ -1340,7 +1707,8 @@ namespace NBT {
                        // type byte + two name size bytes = 3
                        if (!validateRawNBTData(data, dataSize, currentPosition + (uint64_t) nameSize + 3, processedTagSize)) {
                            delete processedTagSize;
-                            return false;
+                            returnValue = false;
+                            goto returnNow;
                        }
                        *processedTagSize += (uint64_t) nameSize + 3;
                    }
@ -1349,17 +1717,20 @@ namespace NBT {
                    delete processedTagSize;
                    continue;
                }
-                return false;
+                returnValue = false;
+                goto returnNow;
            }

            if (currentPosition + nextTagSize.value > dataSize) {
-                return false;
+                returnValue = false;
+                goto returnNow;
            }

            // recursion abort condition
            if (data[currentPosition]==TagType::END) {
                currentPosition++;
-                return true;
+                returnValue = true;
+                goto returnNow;
            }

            // nameSize cannot be an error here bc it got checked in
@ -1371,11 +1742,15 @@ namespace NBT {
            // This shouldn't matter too much here as the only error condition
            // the parser function deals with rn is an overrun which is already
            // being guarded against with
-            // if (currentPosition + nextTagSize.value > dataSize) return false;
+            // if (currentPosition + nextTagSize.value > dataSize) {
+            //     returnValue = false;
+            //     goto returnNow;
+            // }
            // It might, however, turn out to be a useful check in the future.
            ErrorOr<tiny_utf8::string> name = Helper::readString(data, dataSize, currentPosition+1);
            if (name.isError) {
-                return false;
+                returnValue = false;
+                goto returnNow;
            }

            switch (data[currentPosition]) {
@ -1393,14 +1768,18 @@ namespace NBT {
                    // This shouldn't matter too much here as the only
                    // error condition the parser function deals with rn is
                    // an overrun which is already being guarded against with
-                    // if (currentPosition + nextTagSize.value > dataSize) return false;
+                    // if (currentPosition + nextTagSize.value > dataSize) {
+                    //     returnValue = false;
+                    //     goto returnNow;
+                    // }
                    // It might, however, turn out to be a useful check
                    // in the future.
                    //
                    // type byte + two name size bytes = 3
                    ErrorOr<tiny_utf8::string> content = Helper::readString(data, dataSize, currentPosition+nameSize+3);
                    if (content.isError) {
-                        return false;
+                        returnValue = false;
+                        goto returnNow;
                    }
                    break;
                }
@ -1408,12 +1787,19 @@ namespace NBT {
                case TagType::INT64_ARRAY:
                    break;
                default:
-                    return false;
+                    returnValue = false;
+                    goto returnNow;
            }

            currentPosition += nextTagSize.value;
        }
-        return true;
-        #undef return
+        returnValue = true;
+        goto returnNow;
+
+        returnNow:
+            if (processedDataSize!=nullptr) {
+                *processedDataSize = currentPosition-initialPosition;
+            }
+            return returnValue;
    }
 }
--- a/src/lib/nbt.hpp
+++ b/src/lib/nbt.hpp
@ -301,5 +301,6 @@ namespace NBT {
        };
    }

+    // Parses the raw NBT data in data (dataSize bytes), starting at
+    // initialPosition, and returns the parsed tags or an error code.
+    //
+    // A vector of tags is returned instead of a single compound tag because
+    // bare tags without a containing compound are accepted as valid data.
+    // The tags are allocated with new; presumably the caller is responsible
+    // for deleting them - TODO confirm against the Tag destructors.
+    //
+    // If processedDataSize is not nullptr, it receives the number of bytes
+    // that have been processed.
+    ErrorOr<std::vector<Tag::Generic*>> deserialize(uint8_t data[], uint64_t dataSize, uint64_t initialPosition=0, uint64_t* processedDataSize=nullptr);
    bool validateRawNBTData(uint8_t data[], uint64_t dataSize, uint64_t initialPosition=0, uint64_t* processedDataSize=nullptr);
 }
Author	SHA1	Message	Date
BodgeMaster	cdc23e7468	lib/nbt: Implement parser	2022-10-15 23:05:26 +02:00
BodgeMaster	e9bfb6eeee	resources/NBT_data: Add two files that are valid NBT by our extended spec Since we allow loose tags to be valid NBT data, this is valid NBT by our spec. This isn’t valid by Mojang’s spec.	2022-10-15 21:41:32 +02:00
BodgeMaster	8b62ec9c88	lib/nbt: Get rid of that ugly #define return hack Instead of doing #define return, the boolean returnValue is set and a goto statement is used to get to the code that does what the macro used to do.	2022-10-15 18:55:58 +02:00