User:2 B/Debatably useful stuff: Difference between revisions
Jump to navigation
Jump to search
No edit summary |
No edit summary |
||
Line 2: | Line 2: | ||
<nowiki># File from the text dump | <nowiki># File from the text dump | ||
filename = "Strings/ | filename = "Strings/Raw/Pikmin 3/EU ROM/EUEng/Chat.msbt" | ||
# Change these for your purposes | # Change these for your purposes | ||
textbox_separator = "\n | textbox_separator = "\n<br>\n" | ||
include_pauses = False | include_pauses = False | ||
include_sound = False | include_sound = False | ||
include_autoadvance = | include_autoadvance = False | ||
include_lookat = | include_lookat = False | ||
include_textsize = True | include_textsize = True | ||
include_variables = False | include_variables = False | ||
include_ruby = True | include_ruby = True | ||
# Change text colors here | |||
colors = { | |||
# emphasis | |||
0x01: "orange", 0x02: "#AA0", 0x03: "cyan", 0x26: "goldenrod", | |||
# pikmin colors | |||
0x0b: "red", 0x0e: "#AA0", 0x11: "blue", 0x14: "grey", 0x17: "pink", | |||
# leader colors | |||
0x1a: "lightblue", 0x1d: "pink", 0x20: "lime", | |||
# ultra-spicy | |||
0x25: "orange",} | |||
def clean_text( | def decode(bytes, byte_order = "little"): | ||
characters = ("Alph", "Brittany", "Charlie", "Louie", "Olimar", "S.S. Drake") | if byte_order == "big": | ||
return bytes.decode("utf-16-be") | |||
else: | |||
return bytes.decode("utf-16-le") | |||
def clean_text(text, byte_order): | |||
characters = ("Alph", "Brittany", "Charlie", "Louie", "Olimar", "S.S. Drake", "President") | |||
emotes = ("neutral", "shocked", "happy", "sad", "mad") | emotes = ("neutral", "shocked", "happy", "sad", "mad") | ||
buttons = [ | buttons = [ | ||
'\\00', '\\01', '\\02', '\\03', '\\04', '\\05', '\\06', '\\07', '\\08', '\\09', | '\\00', '\\01', '\\02', '\\03', '\\04', '\\05', '\\06', '\\07', '\\08', '\\09', | ||
Line 39: | Line 49: | ||
] | ] | ||
out = "" | out = "" | ||
i = 0 | i = 0 | ||
color = False | color = False | ||
while i < len(text): | while i < len(text): | ||
if text[i] == '\x0e': | tmp = decode(text[i:i+2], byte_order) | ||
i += | if decode(text[i:i+2], byte_order) == '\x0e': | ||
tag = text[i:i+ | i += 2 | ||
extra_size = | tag = decode(text[i:i+4], byte_order) | ||
i += | extra_size = int.from_bytes(text[i+4:i+6], byte_order) | ||
i += 6 | |||
if tag == '\x00\x00': | if tag == '\x00\x00': | ||
if include_ruby: | if include_ruby: | ||
kanji_count = | tmp = text[i:i+4] | ||
furigana_count = | kanji_count = int.from_bytes(text[i:i+2], byte_order) | ||
furigana = text[i+ | furigana_count = int.from_bytes(text[i+2:i+4], byte_order) | ||
kanji = text[i+ | furigana = text[i+4 : i+4 + furigana_count] | ||
out += f"<ruby>{kanji}<rt>{furigana}</rt></ruby>" | kanji = text[i+4 + furigana_count : i+4 + furigana_count + kanji_count] | ||
out += f"<ruby>{decode(kanji, byte_order)}<rt>{decode(furigana, byte_order)}</rt></ruby>" | |||
i += kanji_count | i += kanji_count | ||
i += extra_size | i += extra_size | ||
Line 62: | Line 73: | ||
if tag == '\x00\x02': | if tag == '\x00\x02': | ||
if include_textsize: | if include_textsize: | ||
out += f"[text size: { | out += f"[text size: {int.from_bytes(text[i], byte_order)}]" | ||
i += extra_size | i += extra_size | ||
continue | continue | ||
if tag == '\x00\x03': | if tag == '\x00\x03': | ||
if text[i] == '\ | if text[i:i+2] == b'\xff\xff': | ||
# out += "[white]" | # out += "[white]" | ||
if color: | if color: | ||
out += f"}}}}" | out += f"}}}}" | ||
color = False | color = False | ||
elif | elif int.from_bytes(text[i:i+2], byte_order) in colors: | ||
# out += "[orange]" | # out += "[orange]" | ||
if color: | if color: | ||
out += f"}}}}" | out += f"}}}}" | ||
out += f"{{{{color|2={colors[ | out += f"{{{{color|2={colors[int.from_bytes(text[i:i+2], byte_order)]}|" | ||
color = True | color = True | ||
else: | else: | ||
out += f"[unknown color: {hex( | out += f"[unknown color: {hex(int.from_bytes(text[i:i+2], byte_order))}]" | ||
i += extra_size | i += extra_size | ||
continue | continue | ||
Line 89: | Line 100: | ||
if tag[0] == '\x01': | if tag[0] == '\x01': | ||
name_len = | name_len = int.from_bytes(text[i:i+2], byte_order) | ||
var_name = text[i+ | var_name = decode(text[i+2 : i+2 + name_len], byte_order) | ||
assert(text[i+ | assert(text[i+2 + name_len : i+2 + name_len + 4] == b"\x01\x00\x00\x00") | ||
if include_variables: | if include_variables: | ||
out += f"[variable: {hex( | out += f"[variable: {hex(int.from_bytes(text[i-4:i-2]))} \"{var_name}\"]" | ||
else: | else: | ||
out += "_" | out += "_" | ||
Line 100: | Line 111: | ||
if tag == '\x07\x01': | if tag == '\x07\x01': | ||
assert(text[i+1] == 0xcd) | |||
button_id = text[i] | |||
button_id | button_name = hex(button_id) | ||
out += f"[button: { | if button_id < len(buttons): | ||
button_name = buttons[button_id] | |||
out += f"[button: {button_name}]" | |||
i += extra_size | i += extra_size | ||
continue | continue | ||
Line 109: | Line 122: | ||
if tag == '\x15\x00': | if tag == '\x15\x00': | ||
if include_sound: | if include_sound: | ||
detail = | detail = int.from_bytes(text[i:i+2]) | ||
high = detail >> 8 | high = detail >> 8 | ||
low = detail & 0x00FF | low = detail & 0x00FF | ||
Line 119: | Line 132: | ||
if tag == '\x15\x01': | if tag == '\x15\x01': | ||
if include_lookat: | if include_lookat: | ||
detail = | detail = int.from_bytes(text[i:i+2]) | ||
character_name = characters[detail >> 8] | character_name = characters[detail >> 8] | ||
target = characters[detail & 0x00FF] | target = characters[detail & 0x00FF] | ||
out += f"[look at: {character_name} -> {target}]" | out += f"[look at: {character_name} -> {target}]" | ||
i += extra_size | |||
continue | |||
if tag == '\x17\x00': | |||
name_len = int.from_bytes(text[i:i+2], byte_order) | |||
name = text[i+2 : i+2 + name_len] | |||
out += f"[no text: \"{decode(name, byte_order)}\"]" | |||
i += extra_size | i += extra_size | ||
continue | continue | ||
Line 128: | Line 148: | ||
if tag == '\x17\x01': | if tag == '\x17\x01': | ||
if include_pauses: | if include_pauses: | ||
out += f"[pause: { | out += f"[pause: {int.from_bytes(text[i:i+2], byte_order)}]" | ||
i += extra_size | i += extra_size | ||
continue | continue | ||
if tag == '\x17\x02': | if tag == '\x17\x02': | ||
detail = | detail = int.from_bytes(text[i:i+2]) | ||
high = detail >> 8 | high = detail >> 8 | ||
low = detail & 0x00FF | low = detail & 0x00FF | ||
Line 145: | Line 165: | ||
out += f"(speaker mismatch, should be {characters[high]}) " | out += f"(speaker mismatch, should be {characters[high]}) " | ||
else: | else: | ||
if 0x1f <= low <= 0x23 : | |||
character_id = 6 | |||
out += f"{{{{icon|{ | emote = emotes[low - 0x1f] | ||
if high != | else: | ||
character_id = low % 5 | |||
emote = "neutral" if high == 5 else emotes[low // 5] | |||
out += f"{{{{icon|{characters[character_id]}|v={emote}}}}} " | |||
if high != character_id: | |||
out += f"(speaker mismatch, should be {characters[high]}) " | out += f"(speaker mismatch, should be {characters[high]}) " | ||
Line 161: | Line 185: | ||
if tag == '\x17\x04': | if tag == '\x17\x04': | ||
name_len = | name_len = int.from_bytes(text[i:i+2], byte_order) | ||
sound_name = text[i+ | sound_name = decode(text[i+2 : i+2 + name_len], byte_order) | ||
if include_sound: | if include_sound: | ||
out += f"[sound: \"{sound_name}\", { | out += f"[sound: \"{sound_name}\", {text[i+2 + name_len]}]" | ||
i += extra_size | i += extra_size | ||
continue | continue | ||
out += f"[unknown tag: {ord(tag[0])}, {ord(tag[1])}: \"{text[i:i+extra_size]}\"]" | out += f"[unknown tag: {hex(ord(tag[0]))}, {hex(ord(tag[1]))}: \"{text[i+4:i+extra_size]}\"]" | ||
i += extra_size | i += extra_size | ||
continue | continue | ||
out += text[i] | out += decode(text[i:i+2], byte_order) | ||
i += | i += 2 | ||
return out | return out | ||
with open(filename, ' | |||
with open(filename, 'rb') as f: | |||
text = f.read() | |||
for ( | byte_order_mark = text[8:10] | ||
byte_order = "big" if byte_order_mark == b'\xfe\xff' else "little" | |||
text = text[0x20:] | |||
blocks = {} | |||
while len(text) != 0: | |||
block_sign = text[:4].decode() | |||
text = text[4:] | |||
block_size = int.from_bytes(text[:4], byte_order) | |||
text = text[4:] | |||
text = text[8:] | |||
blocks[block_sign] = text[:block_size] | |||
text = text[block_size:] | |||
if len(text) == 0: | |||
break | |||
while text[0] == 0xab: | |||
text = text[1:] | |||
if len(text) == 0: | |||
break | |||
block = blocks["LBL1"] | |||
label_group_count = int.from_bytes(block[:4], byte_order) | |||
labels = [] | |||
for group_id in range(label_group_count): | |||
label_count = int.from_bytes(block[4 + 8*group_id : 4 + 8*group_id + 4], byte_order) | |||
offset = int.from_bytes(block[4 + 8*group_id + 4 : 4 + 8*group_id + 8], byte_order) | |||
i = offset | |||
for label_id in range(label_count): | |||
label_size = int.from_bytes(block[i:i+1], byte_order) | |||
i += 1 | |||
try: | |||
label = block[i : i + label_size].decode() | |||
i += label_size | |||
labels.append((label, int.from_bytes(block[i:i+4], byte_order))) | |||
i += 4 | |||
except UnicodeDecodeError: | |||
i += 4 | |||
block = blocks["TXT2"] | |||
message_count = int.from_bytes(block[:4], byte_order) | |||
message_offsets = [] | |||
for message_id in range(message_count): | |||
offset = int.from_bytes(block[4 + 4*message_id : 4 + 4*message_id + 4], byte_order) | |||
message_offsets.append(offset) | |||
message_offsets.append(len(block)) | |||
messages = {} | |||
for label, message_id in labels: | |||
if message_id > message_count: | |||
continue | |||
start_offset = message_offsets[message_id] | |||
end_offset = len(block) | |||
for offset in message_offsets: | |||
if offset > start_offset and offset < end_offset: | |||
end_offset = offset | |||
raw_message = block[start_offset:end_offset] | |||
message = "" | |||
for i in range(0,len(raw_message),2): | |||
message += chr(int.from_bytes(raw_message[i:i+2], byte_order)) | |||
messages[label] = clean_text(raw_message, byte_order)[:-1] | |||
sorted_labels = sorted(messages.keys()) | |||
for label in sorted_labels: | |||
print(f"=={label}==\n{messages[label]}\n") | |||
</nowiki> |
Revision as of 10:54, January 21, 2025
Python script to clean up files from the Pikmin 3 text dump
"""Python script to clean up files from the Pikmin 3 text dump.

Reads one ``.msbt`` file, turns the control tags embedded in its messages
into wiki markup or bracketed notes, and prints every labelled message.
"""

# File from the text dump
filename = "Strings/Raw/Pikmin 3/EU ROM/EUEng/Chat.msbt"

# Change these for your purposes
textbox_separator = "\n<br>\n"
include_pauses = False
include_sound = False
include_autoadvance = False
include_lookat = False
include_textsize = True
include_variables = False
include_ruby = True

# Change text colors here
colors = {
    # emphasis
    0x01: "orange", 0x02: "#AA0", 0x03: "cyan", 0x26: "goldenrod",
    # pikmin colors
    0x0b: "red", 0x0e: "#AA0", 0x11: "blue", 0x14: "grey", 0x17: "pink",
    # leader colors
    0x1a: "lightblue", 0x1d: "pink", 0x20: "lime",
    # ultra-spicy
    0x25: "orange",
}

# Lookup tables indexed by the ids found in control-tag payloads.
_CHARACTERS = ("Alph", "Brittany", "Charlie", "Louie", "Olimar", "S.S. Drake", "President")
_EMOTES = ("neutral", "shocked", "happy", "sad", "mad")
_BUTTONS = (
    '\\00', '\\01', '\\02', '\\03', '\\04', '\\05', '\\06', '\\07', '\\08', '\\09',
    'photo flash', 'photo', '\\0c', '\\0d', '\\0e', '\\0f', '\\10', '\\11', '\\12', '\\13',
    'roll right', '\\15', 'roll left', 'photo zoom', 'use onion', 'move', 'throw', 'whistle', 'dismiss', 'change type',
    'reset camera', 'lock-on', 'charge', 'switch leader', 'spray', '\\23', '\\24', '\\25', '\\26', '\\27',
    '\\28', '\\29', '\\2a', '\\2b', 'pluck', 'cancel', '\\2e', 'shake',
)

# Tag layout in bytes: marker (2) + group (2) + type (2) + payload size (2).
_TAG_HEADER_SIZE = 8


def decode(bytes, byte_order = "little"):
    """Decode UTF-16 *bytes* with the given byte order.

    Any *byte_order* other than ``"big"`` is treated as little-endian.
    The parameter keeps its original name (even though it shadows the
    builtin) so that existing keyword callers keep working.
    """
    codec = "utf-16-be" if byte_order == "big" else "utf-16-le"
    return bytes.decode(codec)


def _u16(data, pos, byte_order):
    """Return the 16-bit unsigned integer stored at ``data[pos:pos + 2]``."""
    return int.from_bytes(data[pos:pos + 2], byte_order)


def _byte_pair(data, pos):
    """Return ``(first_byte, second_byte)`` of the two bytes at *pos*.

    The original script called ``int.from_bytes`` without a byte order here,
    which only works on Python 3.11+ where it defaults to big-endian.  The
    order is now explicit so the same values are produced on older versions.
    NOTE(review): unverified for little-endian files - confirm whether these
    payloads are two separate bytes or one 16-bit value.
    """
    value = int.from_bytes(data[pos:pos + 2], "big")
    return value >> 8, value & 0x00FF


def _render_tag(text, pos, group, kind, size, byte_order, color_open):
    """Render one control tag whose payload starts at byte offset *pos*.

    Returns ``(rendered, color_open, trailing)``: the text to emit, the
    updated "a color template is currently open" flag, and the number of
    bytes *after* the payload that were consumed as part of the tag (only
    the ruby tag consumes any).
    """
    rendered = ""
    trailing = 0
    tag = (group, kind)

    if tag == (0x00, 0x00):
        if include_ruby:
            kanji_size = _u16(text, pos, byte_order)
            furigana_size = _u16(text, pos + 2, byte_order)
            furigana_start = pos + 4
            kanji_start = furigana_start + furigana_size
            kanji = decode(text[kanji_start:kanji_start + kanji_size], byte_order)
            furigana = decode(text[furigana_start:kanji_start], byte_order)
            rendered = f"<ruby>{kanji}<rt>{furigana}</rt></ruby>"
            # The base text is emitted inside <ruby>, so skip it afterwards.
            trailing = kanji_size
    elif tag == (0x00, 0x02):
        if include_textsize:
            # Read a two-byte slice: indexing bytes yields an int, which
            # int.from_bytes() rejects with TypeError.
            rendered = f"[text size: {_u16(text, pos, byte_order)}]"
    elif tag == (0x00, 0x03):
        color_id = _u16(text, pos, byte_order)
        if text[pos:pos + 2] == b'\xff\xff':
            # Back to the default color: close the open template, if any.
            if color_open:
                rendered = "}}"
            color_open = False
        elif color_id in colors:
            if color_open:
                rendered = "}}"
            rendered += "{{color|2=" + colors[color_id] + "|"
            color_open = True
        else:
            rendered = f"[unknown color: {hex(color_id)}]"
    elif tag == (0x00, 0x04):
        rendered = textbox_separator
    elif group == 0x01:
        name_size = _u16(text, pos, byte_order)
        name_end = pos + 2 + name_size
        var_name = decode(text[pos + 2:name_end], byte_order)
        # Sanity check on the bytes that follow the variable name.
        assert text[name_end:name_end + 4] == b"\x01\x00\x00\x00", "unexpected variable tag trailer"
        if include_variables:
            rendered = f"[variable: {hex(kind)} \"{var_name}\"]"
        else:
            rendered = "_"
    elif tag == (0x07, 0x01):
        assert text[pos + 1] == 0xcd, "unexpected button tag padding"
        button_id = text[pos]
        if button_id < len(_BUTTONS):
            button_name = _BUTTONS[button_id]
        else:
            button_name = hex(button_id)
        rendered = f"[button: {button_name}]"
    elif tag == (0x15, 0x00):
        if include_sound:
            high, _ = _byte_pair(text, pos)
            rendered = f"[{_CHARACTERS[high]} thinking sound]"
    elif tag == (0x15, 0x01):
        if include_lookat:
            high, low = _byte_pair(text, pos)
            rendered = f"[look at: {_CHARACTERS[high]} -> {_CHARACTERS[low]}]"
    elif tag == (0x17, 0x00):
        name_size = _u16(text, pos, byte_order)
        name = decode(text[pos + 2:pos + 2 + name_size], byte_order)
        rendered = f"[no text: \"{name}\"]"
    elif tag == (0x17, 0x01):
        if include_pauses:
            rendered = f"[pause: {_u16(text, pos, byte_order)}]"
    elif tag == (0x17, 0x02):
        high, low = _byte_pair(text, pos)
        if low == 0x19:
            rendered = f"{_CHARACTERS[high]} (no icon): "
        elif low == 0x1a:
            rendered = "{{icon|S.S. Drake}} "
            if high != 5:
                rendered += f"(speaker mismatch, should be {_CHARACTERS[high]}) "
        else:
            if 0x1f <= low <= 0x23:
                character_id = 6
                emote = _EMOTES[low - 0x1f]
            else:
                character_id = low % 5
                emote = "neutral" if high == 5 else _EMOTES[low // 5]
            rendered = "{{icon|" + _CHARACTERS[character_id] + "|v=" + emote + "}} "
            if high != character_id:
                rendered += f"(speaker mismatch, should be {_CHARACTERS[high]}) "
    elif tag == (0x17, 0x03):
        if include_autoadvance:
            rendered = "[advance automatically]"
    elif tag == (0x17, 0x04):
        name_size = _u16(text, pos, byte_order)
        sound_name = decode(text[pos + 2:pos + 2 + name_size], byte_order)
        if include_sound:
            rendered = f"[sound: \"{sound_name}\", {text[pos + 2 + name_size]}]"
    else:
        # Show the whole payload; *pos* already points at its first byte.
        rendered = f"[unknown tag: {hex(group)}, {hex(kind)}: \"{text[pos:pos + size]}\"]"

    return rendered, color_open, trailing


def clean_text(text, byte_order):
    """Convert one raw message into readable text.

    Args:
        text: the message as raw UTF-16 bytes, control tags included.
        byte_order: ``"big"`` or ``"little"``, the byte order of the file.

    Returns:
        The message as a string, with each control tag replaced by wiki
        markup or a bracketed note (or dropped, depending on the
        ``include_*`` settings at the top of the script).

    Raises:
        ValueError: if a control tag header is cut off by the end of *text*.
        UnicodeDecodeError: if the plain text is not valid UTF-16.
    """
    # Same rule as decode(): anything other than "big" means little-endian.
    marker = b"\x00\x0e" if byte_order == "big" else b"\x0e\x00"
    pieces = []
    color_open = False
    run_start = 0  # start of the plain-text run not yet emitted
    i = 0
    while i < len(text):
        if text[i:i + 2] != marker:
            i += 2
            continue

        # Decode the whole plain run at once so surrogate pairs stay intact.
        pieces.append(decode(text[run_start:i], byte_order))
        if len(text) < i + _TAG_HEADER_SIZE:
            raise ValueError(f"truncated control tag at byte offset {i}")

        group = _u16(text, i + 2, byte_order)
        kind = _u16(text, i + 4, byte_order)
        payload_size = _u16(text, i + 6, byte_order)
        payload_start = i + _TAG_HEADER_SIZE
        rendered, color_open, trailing = _render_tag(
            text, payload_start, group, kind, payload_size, byte_order, color_open
        )
        pieces.append(rendered)
        i = payload_start + payload_size + trailing
        run_start = i

    pieces.append(decode(text[run_start:], byte_order))
    return "".join(pieces)


def parse_blocks(data, byte_order):
    """Split the file contents into its blocks, keyed by 4-character name.

    Blocks start at offset 0x20.  Each has a 4-byte name, a 4-byte size and
    8 more header bytes that are skipped, followed by the contents and any
    number of 0xAB filler bytes.
    """
    blocks = {}
    pos = 0x20
    while pos < len(data):
        name = data[pos:pos + 4].decode()
        size = int.from_bytes(data[pos + 4:pos + 8], byte_order)
        pos += 16
        blocks[name] = data[pos:pos + size]
        pos += size
        while pos < len(data) and data[pos] == 0xab:
            pos += 1
    return blocks


def parse_labels(block, byte_order):
    """Return the ``(label, message_index)`` pairs stored in the LBL1 block.

    Labels whose bytes cannot be decoded are skipped.  The cursor always
    advances past the complete entry, so one undecodable label cannot
    corrupt the labels that follow it.
    """
    labels = []
    group_count = int.from_bytes(block[:4], byte_order)
    for group_id in range(group_count):
        entry = 4 + 8 * group_id
        label_count = int.from_bytes(block[entry:entry + 4], byte_order)
        pos = int.from_bytes(block[entry + 4:entry + 8], byte_order)
        for _ in range(label_count):
            label_size = int.from_bytes(block[pos:pos + 1], byte_order)
            label_start = pos + 1
            label_end = label_start + label_size
            raw_label = block[label_start:label_end]
            index = int.from_bytes(block[label_end:label_end + 4], byte_order)
            pos = label_end + 4
            try:
                label = raw_label.decode()
            except UnicodeDecodeError:
                continue
            labels.append((label, index))
    return labels


def parse_messages(block, labels, byte_order):
    """Return ``{label: cleaned message}`` for the messages in the TXT2 block.

    Labels that point at a message index outside the offset table are
    ignored.  A message runs from its offset to the next larger offset in
    the table, or to the end of the block.
    """
    message_count = int.from_bytes(block[:4], byte_order)
    offsets = [
        int.from_bytes(block[4 + 4 * k:8 + 4 * k], byte_order)
        for k in range(message_count)
    ]
    block_end = len(block)

    messages = {}
    for label, message_id in labels:
        # Valid indices are 0 .. message_count - 1.
        if message_id >= message_count:
            continue
        start = offsets[message_id]
        end = min((o for o in offsets if start < o < block_end), default=block_end)
        cleaned = clean_text(block[start:end], byte_order)
        # Drop the NUL terminator, but never a real character.
        if cleaned.endswith("\x00"):
            cleaned = cleaned[:-1]
        messages[label] = cleaned
    return messages


def main():
    """Read ``filename`` and print every labelled message, sorted by label."""
    with open(filename, 'rb') as f:
        data = f.read()

    byte_order = "big" if data[8:10] == b'\xfe\xff' else "little"
    blocks = parse_blocks(data, byte_order)
    labels = parse_labels(blocks["LBL1"], byte_order)
    messages = parse_messages(blocks["TXT2"], labels, byte_order)

    for label in sorted(messages):
        print(f"=={label}==\n{messages[label]}\n")


if __name__ == "__main__":
    main()