User:2 B/Debatably useful stuff: Difference between revisions

From Pikipedia, the Pikmin wiki
Jump to navigation Jump to search
No edit summary
Line 1: Line 1:
==Python script to clean up files from the ''Pikmin 3'' text dump==
==Python script to clean up files from the ''Pikmin 3'' text dump==


  <nowiki># File from the text dump
  <nowiki># Change file here
filename = "Strings/Raw/Pikmin 3/EU ROM/EUEng/Chat.msbt"
filename = "Strings/Raw/Pikmin 3/EU ROM/EUEng/Chat.msbt"



Revision as of 10:55, January 21, 2025

Python script to clean up files from the Pikmin 3 text dump

# Path of the .msbt file to convert
filename = "Strings/Raw/Pikmin 3/EU ROM/EUEng/Chat.msbt"

# Output switches: set each one to suit what you want in the result
textbox_separator = "\n<br>\n"  # written out for every 0x00/0x04 tag
include_pauses = False
include_sound = False
include_autoadvance = False
include_lookat = False
include_textsize = True
include_variables = False
include_ruby = True

# Colour id (as stored in the file) -> colour name/value passed to {{color}}
colors = {
    # emphasis
    0x01: "orange",
    0x02: "#AA0",
    0x03: "cyan",
    0x26: "goldenrod",
    # pikmin colors
    0x0b: "red",
    0x0e: "#AA0",
    0x11: "blue",
    0x14: "grey",
    0x17: "pink",
    # leader colors
    0x1a: "lightblue",
    0x1d: "pink",
    0x20: "lime",
    # ultra-spicy
    0x25: "orange",
}


def decode(bytes, byte_order = "little"):
    """Decode UTF-16 data in the requested byte order.

    ``byte_order == "big"`` selects big-endian UTF-16; any other value is
    treated as little-endian.  No byte-order mark is expected or consumed.
    """
    codec = "utf-16-be" if byte_order == "big" else "utf-16-le"
    return bytes.decode(codec)

def clean_text(text, byte_order):
    """Turn the raw bytes of one message into wiki-ready text.

    ``text`` is a ``bytes`` object holding UTF-16 code units in
    ``byte_order`` ("big" or anything else for little-endian).  Ordinary
    code units are copied to the output.  A 0x000E code unit starts a
    control tag laid out as: two code units identifying the tag, a 2-byte
    parameter size, then that many parameter bytes.  Each known tag is
    either rendered as wiki markup / a bracketed note or dropped, depending
    on the module-level ``include_*`` switches; ``colors`` and
    ``textbox_separator`` are read from module level as well.

    Returns the converted text as a ``str``.  A ``{{color|...`` template
    that is still open when the input ends is not closed here.

    Raises:
        AssertionError: a variable or button tag does not have the byte
            layout this parser expects.
        IndexError: a character/emote id in a tag is outside the lookup
            tables below.
        UnicodeDecodeError: the data is not valid UTF-16.
    """
    characters = ("Alph", "Brittany", "Charlie", "Louie", "Olimar", "S.S. Drake", "President")
    emotes = ("neutral", "shocked", "happy", "sad", "mad")
    # Indexed by button id; entries that look like '\\NN' are ids with no
    # name assigned in this table.
    buttons = [
        '\\00', '\\01', '\\02', '\\03', '\\04', '\\05', '\\06', '\\07', '\\08', '\\09',
        'photo flash', 'photo',
        '\\0c', '\\0d', '\\0e', '\\0f', '\\10', '\\11', '\\12', '\\13',
        'roll right',
        '\\15',
        'roll left', 'photo zoom', 'use onion', 'move', 'throw', 'whistle', 'dismiss', 'change type',
        'reset camera', 'lock-on', 'charge', 'switch leader', 'spray',
        '\\23', '\\24', '\\25', '\\26', '\\27', '\\28', '\\29', '\\2a', '\\2b',
        'pluck', 'cancel',
        '\\2e',
        'shake',
    ]

    pieces = []      # output fragments, joined once at the end
    i = 0            # byte offset into ``text``
    color = False    # True while a {{color|...}} template is open
    while i < len(text):
        char = decode(text[i:i+2], byte_order)
        if char != '\x0e':
            # Plain character: copy through.
            pieces.append(char)
            i += 2
            continue

        # Tag header: marker (2 bytes), tag id (4 bytes), parameter size (2 bytes).
        i += 2
        tag = decode(text[i:i+4], byte_order)
        extra_size = int.from_bytes(text[i+4:i+6], byte_order)
        i += 6
        # From here on ``i`` points at the first parameter byte.

        if tag == '\x00\x00':
            # Ruby: parameters are kanji byte count, furigana byte count and
            # the furigana; the kanji themselves follow the tag in the text.
            if include_ruby:
                kanji_count = int.from_bytes(text[i:i+2], byte_order)
                furigana_count = int.from_bytes(text[i+2:i+4], byte_order)
                furigana_start = i + 4
                kanji_start = furigana_start + furigana_count
                furigana = text[furigana_start:kanji_start]
                kanji = text[kanji_start:kanji_start + kanji_count]
                pieces.append(f"<ruby>{decode(kanji, byte_order)}<rt>{decode(furigana, byte_order)}</rt></ruby>")
                # The kanji were emitted inside <ruby>; skip them so they are
                # not copied a second time as plain text.
                i += kanji_count

        elif tag == '\x00\x02':
            if include_textsize:
                # Read the value from a 2-byte slice: indexing bytes gives an
                # int, which int.from_bytes() rejects with TypeError.
                size = int.from_bytes(text[i:i+2], byte_order)
                pieces.append(f"[text size: {size}]")

        elif tag == '\x00\x03':
            color_id = int.from_bytes(text[i:i+2], byte_order)
            if text[i:i+2] == b'\xff\xff':
                # Back to the default colour: close any open template.
                if color:
                    pieces.append("}}")
                color = False
            elif color_id in colors:
                if color:
                    pieces.append("}}")
                pieces.append("{{color|2=" + f"{colors[color_id]}" + "|")
                color = True
            else:
                pieces.append(f"[unknown color: {hex(color_id)}]")

        elif tag == '\x00\x04':
            pieces.append(textbox_separator)

        elif tag[0] == '\x01':
            name_len = int.from_bytes(text[i:i+2], byte_order)
            var_name = decode(text[i+2 : i+2 + name_len], byte_order)
            assert(text[i+2 + name_len : i+2 + name_len + 4] == b"\x01\x00\x00\x00")
            if include_variables:
                # text[i-4:i-2] is the second code unit of the tag id; read
                # it in the file's byte order, like the tag itself.
                var_type = int.from_bytes(text[i-4:i-2], byte_order)
                pieces.append(f"[variable: {hex(var_type)} \"{var_name}\"]")
            else:
                pieces.append("_")

        elif tag == '\x07\x01':
            assert(text[i+1] == 0xcd)
            button_id = text[i]
            if button_id < len(buttons):
                button_name = buttons[button_id]
            else:
                button_name = hex(button_id)
            pieces.append(f"[button: {button_name}]")

        elif tag == '\x15\x00':
            if include_sound:
                # NOTE(review): these two bytes are always split first-byte /
                # second-byte ("big"), independent of ``byte_order``. That is
                # what int.from_bytes() defaults to on Python 3.11+; confirm
                # it is also right for little-endian dumps.
                detail = int.from_bytes(text[i:i+2], "big")
                character = characters[detail >> 8]
                pieces.append(f"[{character} thinking sound]")

        elif tag == '\x15\x01':
            if include_lookat:
                detail = int.from_bytes(text[i:i+2], "big")  # see NOTE(review) above
                character_name = characters[detail >> 8]
                target = characters[detail & 0x00FF]
                pieces.append(f"[look at: {character_name} -> {target}]")

        elif tag == '\x17\x00':
            name_len = int.from_bytes(text[i:i+2], byte_order)
            name = decode(text[i+2 : i+2 + name_len], byte_order)
            pieces.append(f"[no text: \"{name}\"]")

        elif tag == '\x17\x01':
            if include_pauses:
                pieces.append(f"[pause: {int.from_bytes(text[i:i+2], byte_order)}]")

        elif tag == '\x17\x02':
            # Speaker tag: first byte is the speaking character, second byte
            # selects the icon.
            detail = int.from_bytes(text[i:i+2], "big")  # see NOTE(review) above
            high = detail >> 8
            low = detail & 0x00FF
            if low == 0x19:
                pieces.append(f"{characters[high]} (no icon): ")
            elif low == 0x1a:
                pieces.append("{{icon|S.S. Drake}} ")
                if high != 5:
                    pieces.append(f"(speaker mismatch, should be {characters[high]}) ")
            else:
                if 0x1f <= low <= 0x23:
                    # Dedicated icon range for characters[6].
                    character_id = 6
                    emote = emotes[low - 0x1f]
                else:
                    # Remaining icons: id % 5 picks the character and
                    # id // 5 the emote.
                    character_id = low % 5
                    emote = "neutral" if high == 5 else emotes[low // 5]
                pieces.append("{{icon|" + f"{characters[character_id]}|v={emote}" + "}} ")
                if high != character_id:
                    pieces.append(f"(speaker mismatch, should be {characters[high]}) ")

        elif tag == '\x17\x03':
            if include_autoadvance:
                pieces.append("[advance automatically]")

        elif tag == '\x17\x04':
            name_len = int.from_bytes(text[i:i+2], byte_order)
            sound_name = decode(text[i+2 : i+2 + name_len], byte_order)
            if include_sound:
                pieces.append(f"[sound: \"{sound_name}\", {text[i+2 + name_len]}]")

        else:
            # NOTE(review): ``i`` already points at the parameters, so this
            # dump omits their first four bytes -- confirm that is intended.
            pieces.append(f"[unknown tag: {hex(ord(tag[0]))}, {hex(ord(tag[1]))}: \"{text[i+4:i+extra_size]}\"]")

        # Step over the tag's parameter bytes.
        i += extra_size

    return "".join(pieces)



# Read the whole file up front; everything below works on the in-memory bytes.
with open(filename, 'rb') as f:
    text = f.read()

# Bytes 8-9 of the header hold the byte-order mark; the first block starts
# at offset 0x20.
byte_order_mark = text[8:10]
byte_order = "big" if byte_order_mark == b'\xfe\xff' else "little"
text = text[0x20:]

# Split the remainder into blocks keyed by their 4-character signature.
blocks = {}
while text:
    block_sign = text[:4].decode()
    block_size = int.from_bytes(text[4:8], byte_order)
    # Block header: 4-byte signature, 4-byte size, then 8 bytes that are skipped.
    text = text[16:]
    blocks[block_sign] = text[:block_size]
    text = text[block_size:]
    # Blocks are followed by 0xab filler bytes; drop them before the next header.
    text = text.lstrip(b'\xab')


# LBL1: a count of groups, one (label count, offset) pair of 4-byte integers
# per group, and at each offset that many records of
# <1-byte length><label bytes><4-byte message index>.
block = blocks["LBL1"]
label_group_count = int.from_bytes(block[:4], byte_order)
labels = []
for group_id in range(label_group_count):
    entry = 4 + 8*group_id
    label_count = int.from_bytes(block[entry : entry + 4], byte_order)
    offset = int.from_bytes(block[entry + 4 : entry + 8], byte_order)
    i = offset
    for label_id in range(label_count):
        label_size = int.from_bytes(block[i:i+1], byte_order)
        i += 1
        raw_label = block[i : i + label_size]
        i += label_size
        label_index = int.from_bytes(block[i:i+4], byte_order)
        i += 4
        # The cursor is advanced past the whole record before decoding, so a
        # label that is not valid UTF-8 is skipped without misaligning the
        # records that follow it.
        try:
            label = raw_label.decode()
        except UnicodeDecodeError:
            continue
        labels.append((label, label_index))

# TXT2: a message count followed by one 4-byte offset per message.
block = blocks["TXT2"]
message_count = int.from_bytes(block[:4], byte_order)
message_offsets = [
    int.from_bytes(block[4 + 4*message_id : 4 + 4*message_id + 4], byte_order)
    for message_id in range(message_count)
]

# Sentinel so the last message ends at the end of the block.
message_offsets.append(len(block))

messages = {}
for label, message_id in labels:
    # Valid indices are 0 .. message_count - 1; index ``message_count`` would
    # otherwise hit the sentinel above and produce an empty, bogus message.
    if message_id >= message_count:
        continue
    start_offset = message_offsets[message_id]
    # A message runs up to the nearest offset after its own start.
    end_offset = len(block)
    for offset in message_offsets:
        if start_offset < offset < end_offset:
            end_offset = offset

    raw_message = block[start_offset:end_offset]
    # Drop the final character of the converted text
    # (presumably the message's NUL terminator -- TODO confirm).
    messages[label] = clean_text(raw_message, byte_order)[:-1]

sorted_labels = sorted(messages)

for label in sorted_labels:
    print(f"=={label}==\n{messages[label]}\n")