pyrom/app/lib/babycode_parser.py

242 lines
5.2 KiB
Python

# Originally written in Lua by kaesa.
import re
# Single-character patterns: Parser.match_pattern() tests them against one
# character at a time and keeps consuming while they match.
PAT_EMOTE = r"[^\s:]"
PAT_BBCODE_TAG = r"\w"
PAT_BBCODE_ATTR = r"[^\s\]]"
# Whole-URL pattern. The final class keeps a link from ending on punctuation
# such as "." or "?".
PAT_LINK = r"https?:\/\/[\w\-_.?:\/=&~@#%]+[\w\-\/]"

# PAT_LINK compiled once so it can be matched at the cursor position.
# re.ASCII restricts \w to ASCII; links were previously only searched for
# inside runs of printable ASCII, so the set of recognised links is unchanged.
_RE_LINK = re.compile(PAT_LINK, re.ASCII)


class Parser:
    """Recursive-descent parser for BBCode-like markup.

    ``parse()`` returns a list whose items are either plain ``str`` text or
    dict nodes of one of these shapes::

        {"type": "emote", "name": str}
        {"type": "bbcode", "name": str, "attr": str | None, "children": list}
        {"type": "rule"}
        {"type": "link", "url": str}

    Configure the instance after construction by filling in:

    * ``valid_bbcode_tags`` - tag names accepted by ``parse_bbcode_open``.
    * ``valid_emotes`` - emote names accepted by ``parse_emote``.
    * ``bbcode_tags_only_text_children`` - tags whose body is kept as raw
      text instead of being parsed.
    """

    def __init__(self, src_str):
        """Create a parser positioned at the start of ``src_str``."""
        self.valid_bbcode_tags = []
        self.valid_emotes = []
        # Must be a plain list: a stray trailing comma here used to turn it
        # into the tuple ([],), so no tag name could ever be found in it.
        self.bbcode_tags_only_text_children = []
        self.source = src_str
        self.position = 0
        # Saved cursor positions used for backtracking.
        self.position_stack = []

    def advance(self, count=1):
        """Move the cursor forward by ``count`` characters."""
        self.position += count

    def is_end_of_source(self, offset=0):
        """Return True if ``position + offset`` is at or past the end."""
        return self.position + offset >= len(self.source)

    def save_position(self):
        """Push the current cursor position onto the backtracking stack."""
        self.position_stack.append(self.position)

    def restore_position(self):
        """Pop the most recently saved position and move the cursor to it."""
        self.position = self.position_stack.pop()

    def forget_position(self):
        """Pop the most recently saved position without moving the cursor."""
        self.position_stack.pop()

    def peek_char(self, offset=0):
        """Return the character at ``position + offset``, or "" past the end."""
        if self.is_end_of_source(offset):
            return ""
        return self.source[self.position + offset]

    def get_char(self):
        """Return the current character ("" at the end) and advance by one."""
        char = self.peek_char()
        self.advance()
        return char

    def check_char(self, wanted):
        """Consume the current character if it equals ``wanted``.

        Returns True when the character was consumed, False otherwise.
        """
        if self.peek_char() == wanted:
            self.advance()
            return True
        return False

    def check_str(self, wanted):
        """Consume ``wanted`` if the source continues with it.

        Returns True and advances past ``wanted`` on success; returns False
        and leaves the cursor untouched otherwise.
        """
        end = self.position + len(wanted)
        if self.source[self.position:end] != wanted:
            return False
        self.advance(len(wanted))
        return True

    def match_pattern(self, pattern):
        """Consume characters while each one matches ``pattern``.

        ``pattern`` is applied with ``re.match`` semantics to one character
        at a time. Returns the consumed text, which may be empty.
        """
        matches = re.compile(pattern).match
        source = self.source
        start = end = self.position
        limit = len(source)
        while end < limit and matches(source[end]):
            end += 1
        self.position = end
        return source[start:end]

    def parse_emote(self):
        """Parse ``:name:`` where ``name`` is listed in ``valid_emotes``.

        Returns an emote node, or None with the cursor restored.
        """
        self.save_position()
        if not self.check_char(":"):
            self.restore_position()
            return None
        name = self.match_pattern(PAT_EMOTE)
        if not self.check_char(":") or name not in self.valid_emotes:
            self.restore_position()
            return None
        self.forget_position()
        return {
            "type": "emote",
            "name": name,
        }

    def parse_bbcode_open(self):
        """Parse an opening tag: ``[name]`` or ``[name=attr]``.

        Returns ``(name, attr)`` with ``attr`` set to None when there is no
        ``=`` part. Returns ``(None, None)`` with the cursor restored when
        the text is not a well-formed tag or ``name`` is not listed in
        ``valid_bbcode_tags``.
        """
        self.save_position()
        if not self.check_char("["):
            self.restore_position()
            return None, None
        name = self.match_pattern(PAT_BBCODE_TAG)
        if name == "":
            self.restore_position()
            return None, None
        attr = None
        if self.check_char("="):
            attr = self.match_pattern(PAT_BBCODE_ATTR)
        if not self.check_char("]") or name not in self.valid_bbcode_tags:
            self.restore_position()
            return None, None
        self.forget_position()
        return name, attr

    def _read_raw_children(self, closing):
        """Consume raw text up to and including ``closing``.

        If ``closing`` never appears, the rest of the source is consumed.
        Returns ``[text]``, or ``[]`` when the text is empty.
        """
        start = self.position
        end = self.source.find(closing, start)
        if end == -1:
            text = self.source[start:]
            self.position = len(self.source)
        else:
            text = self.source[start:end]
            self.position = end + len(closing)
        return [text] if text else []

    def parse_bbcode(self):
        """Parse a complete tag: opening tag, children and closing tag.

        Tags listed in ``bbcode_tags_only_text_children`` keep their body as
        a single raw string; any other body is parsed element by element.
        A tag that is never closed extends to the end of the source.

        Returns a bbcode node, or None with the cursor restored.
        """
        self.save_position()
        name, attr = self.parse_bbcode_open()
        if name is None:
            self.restore_position()
            return None
        closing = f"[/{name}]"
        if name in self.bbcode_tags_only_text_children:
            children = self._read_raw_children(closing)
        else:
            children = []
            while not self.is_end_of_source():
                if self.check_str(closing):
                    break
                element = self.parse_element(children)
                if element is None:
                    self.restore_position()
                    return None
                children.append(element)
        self.forget_position()
        return {
            "type": "bbcode",
            "name": name,
            "attr": attr,
            "children": children,
        }

    def parse_rule(self):
        """Parse a ``---`` rule; returns a rule node or None."""
        if not self.check_str("---"):
            return None
        return {
            "type": "rule",
        }

    def parse_link(self):
        """Parse a bare URL starting at the cursor.

        Only the text matched by ``PAT_LINK`` is consumed and returned as
        ``url``. Whatever follows it (spaces, words, trailing punctuation,
        a closing tag) is left for the caller to parse.

        Returns a link node, or None with the cursor untouched.
        """
        match = _RE_LINK.match(self.source, self.position)
        if match is None:
            return None
        self.position = match.end()
        return {
            "type": "link",
            "url": match.group(0),
        }

    def parse_element(self, siblings):
        """Parse the next element, falling back to one character of text.

        When nothing structured matches, one character is consumed as text.
        If the last item of ``siblings`` is a string, it is popped from
        ``siblings`` and returned with the new character appended, so that
        the caller's ``append`` keeps adjacent text in a single string.

        Returns a node dict, a string, or None at the end of the source.
        """
        if self.is_end_of_source():
            return None
        element = (
            self.parse_emote()
            or self.parse_bbcode()
            or self.parse_rule()
            or self.parse_link()
        )
        if element is not None:
            return element
        if siblings and isinstance(siblings[-1], str):
            return siblings.pop() + self.get_char()
        return self.get_char()

    def parse(self):
        """Parse from the cursor to the end and return the list of elements."""
        elements = []
        while True:
            element = self.parse_element(elements)
            if element is None:
                break
            elements.append(element)
        return elements