# originally written in lua by kaesa
"""Tiny recursive-descent parser for forum-style markup.

Recognised constructs are ``:emote:`` names, ``[tag]...[/tag]`` BBCode
(with an optional ``[tag=attr]`` attribute), ``---`` horizontal rules and
``http(s)://`` links. Everything else is returned as plain text.
"""

import re

# Each of the first three patterns is applied to ONE character at a time
# (see ``Parser.match_pattern``); PAT_LINK is applied to a whole candidate.
PAT_EMOTE = r"[^\s:]"
PAT_BBCODE_TAG = r"\w"
PAT_BBCODE_ATTR = r"[^\s\]]"
PAT_LINK = r"https?:\/\/[\w\-_.?:\/=&~@#%]+[\w\-\/]"


class Parser:
    """Parse a source string into a list of text strings and element dicts.

    Configure the instance after construction by filling in:

    * ``valid_bbcode_tags`` -- tag names accepted as BBCode.
    * ``valid_emotes`` -- emote names accepted between colons.
    * ``bbcode_tags_only_text_children`` -- tag names whose content is kept
      as raw text instead of being parsed recursively.
    """

    def __init__(self, src_str):
        """Create a parser positioned at the start of *src_str*."""
        self.valid_bbcode_tags = []
        self.valid_emotes = []
        # Must be a plain list: a stray trailing comma previously turned this
        # into the tuple ``([],)``, which callers could not append to.
        self.bbcode_tags_only_text_children = []
        self.source = src_str
        self.position = 0
        # Stack of saved positions used for backtracking.
        self.position_stack = []

    # ------------------------------------------------------------------
    # Cursor primitives
    # ------------------------------------------------------------------

    def advance(self, count=1):
        """Move the cursor forward by *count* characters."""
        self.position += count

    def is_end_of_source(self, offset=0):
        """Return True if ``position + offset`` is at or past the end."""
        return self.position + offset >= len(self.source)

    def save_position(self):
        """Push the current cursor position onto the backtracking stack."""
        self.position_stack.append(self.position)

    def restore_position(self):
        """Pop the most recently saved position and rewind the cursor to it."""
        self.position = self.position_stack.pop()

    def forget_position(self):
        """Pop the most recently saved position without moving the cursor."""
        self.position_stack.pop()

    def peek_char(self, offset=0):
        """Return the character at ``position + offset``, or ``""`` at the end."""
        if self.is_end_of_source(offset):
            return ""
        return self.source[self.position + offset]

    def get_char(self):
        """Return the current character (``""`` at the end) and advance by one."""
        char = self.peek_char()
        self.advance()
        return char

    def check_char(self, wanted):
        """Consume the current character if it equals *wanted*.

        Returns True when the character was consumed, False otherwise.
        """
        if self.peek_char() == wanted:
            self.advance()
            return True
        return False

    def check_str(self, wanted):
        """Consume *wanted* if the source continues with it at the cursor.

        Returns True when the string was consumed; otherwise the cursor is
        left untouched and False is returned.
        """
        if not self.source.startswith(wanted, self.position):
            return False
        self.advance(len(wanted))
        return True

    def match_pattern(self, pattern):
        """Consume the longest run of characters that each match *pattern*.

        *pattern* is tested against single characters, one at a time.
        Returns the consumed run, which may be the empty string.
        """
        matches = re.compile(pattern).match
        source = self.source
        start = self.position
        end = start
        while end < len(source) and matches(source[end]):
            end += 1
        self.position = end
        # One slice instead of repeated string concatenation.
        return source[start:end]

    # ------------------------------------------------------------------
    # Element parsers -- each returns None and leaves the cursor untouched
    # when its construct is not present at the cursor.
    # ------------------------------------------------------------------

    def parse_emote(self):
        """Parse ``:name:`` where *name* is listed in ``valid_emotes``.

        Returns ``{"type": "emote", "name": name}`` or None.
        """
        self.save_position()
        if not self.check_char(":"):
            self.restore_position()
            return None
        name = self.match_pattern(PAT_EMOTE)
        if not self.check_char(":") or name not in self.valid_emotes:
            self.restore_position()
            return None
        self.forget_position()
        return {"type": "emote", "name": name}

    def parse_bbcode_open(self):
        """Parse an opening tag ``[name]`` or ``[name=attr]``.

        Returns ``(name, attr)`` on success (``attr`` is None when no ``=``
        is present) or ``(None, None)`` when there is no valid opening tag
        whose name is listed in ``valid_bbcode_tags``.
        """
        self.save_position()
        if not self.check_char("["):
            self.restore_position()
            return None, None
        name = self.match_pattern(PAT_BBCODE_TAG)
        if name == "":
            self.restore_position()
            return None, None
        attr = None
        if self.check_char("="):
            attr = self.match_pattern(PAT_BBCODE_ATTR)
        if not self.check_char("]") or name not in self.valid_bbcode_tags:
            self.restore_position()
            return None, None
        self.forget_position()
        return name, attr

    def parse_bbcode(self):
        """Parse a BBCode element including its children.

        Children are parsed recursively unless the tag is listed in
        ``bbcode_tags_only_text_children``, in which case everything up to
        the closing tag is collected as a single text child. A tag that is
        never closed is implicitly closed at the end of the source.

        Returns a dict with keys ``type``, ``name``, ``attr`` and
        ``children``, or None.
        """
        self.save_position()
        name, attr = self.parse_bbcode_open()
        if name is None:
            self.restore_position()
            return None

        closing_tag = f"[/{name}]"
        text_only = name in self.bbcode_tags_only_text_children
        children = []
        while not self.is_end_of_source():
            if self.check_str(closing_tag):
                break
            if text_only:
                ch = self.get_char()
                if children:
                    children[0] = children[0] + ch
                else:
                    children.append(ch)
            else:
                element = self.parse_element(children)
                if element is None:
                    # Defensive: parse_element only yields None at the end
                    # of the source, which the loop condition rules out.
                    self.restore_position()
                    return None
                children.append(element)

        self.forget_position()
        return {
            "type": "bbcode",
            "name": name,
            "attr": attr,
            "children": children,
        }

    def parse_rule(self):
        """Parse a horizontal rule ``---``; returns ``{"type": "rule"}`` or None."""
        if not self.check_str("---"):
            return None
        return {"type": "rule"}

    def parse_link(self):
        """Parse an ``http://`` or ``https://`` link at the cursor.

        Only the text actually matched by ``PAT_LINK`` is consumed and
        returned, so trailing punctuation or following words stay in the
        input as ordinary text.

        Returns ``{"type": "link", "url": url}`` or None.
        """
        self.save_position()
        # Grab the run of printable ASCII ahead of the cursor as a candidate.
        candidate = self.match_pattern(r"[ -~]")
        self.restore_position()

        match = re.match(PAT_LINK, candidate)
        if match is None:
            return None
        url = match.group(0)
        self.advance(len(url))
        return {"type": "link", "url": url}

    def parse_element(self, siblings):
        """Parse the next element or text character at the cursor.

        When no markup construct matches, a single character is consumed as
        text. If the last entry of *siblings* is a string it is popped and
        returned merged with that character, so adjacent text stays in one
        string once the caller appends the result.

        Returns an element dict, a text string, or None at end of source.
        """
        if self.is_end_of_source():
            return None

        element = (
            self.parse_emote()
            or self.parse_bbcode()
            or self.parse_rule()
            or self.parse_link()
        )
        if element is not None:
            return element

        if siblings and isinstance(siblings[-1], str):
            return siblings.pop() + self.get_char()
        return self.get_char()

    def parse(self):
        """Parse the remaining source and return the list of top-level items."""
        elements = []
        while True:
            element = self.parse_element(elements)
            if element is None:
                break
            elements.append(element)
        return elements