From fc6c5d46e103fe27f77248da7b29e6a6343636b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lera=20Elvo=C3=A9?= Date: Sat, 13 Dec 2025 07:36:49 +0300 Subject: [PATCH] refactor babycode lib to have different code paths for html and rss-friendly generation --- app/__init__.py | 36 +-- app/lib/babycode.py | 586 +++++++++++++++++++++++++++++++------------- 2 files changed, 440 insertions(+), 182 deletions(-) diff --git a/app/__init__.py b/app/__init__.py index aeb1545..2b6297e 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -167,20 +167,6 @@ def create_app(): allowed_themes.sort(key=(lambda x: (x != 'style', x))) app.config['allowed_themes'] = allowed_themes - with app.app_context(): - from .schema import create as create_tables - from .migrations import run_migrations - create_tables() - run_migrations() - - create_default_avatar() - create_admin() - create_deleted_user() - - reparse_babycode() - - bind_default_badges(app.config['BADGES_PATH']) - from app.routes.app import bp as app_bp from app.routes.topics import bp as topics_bp from app.routes.threads import bp as threads_bp @@ -200,6 +186,20 @@ def create_app(): app.register_blueprint(hyperapi_bp) app.register_blueprint(guides_bp) + with app.app_context(): + from .schema import create as create_tables + from .migrations import run_migrations + create_tables() + run_migrations() + + create_default_avatar() + create_admin() + create_deleted_user() + + reparse_babycode() + + bind_default_badges(app.config['BADGES_PATH']) + app.config['SESSION_COOKIE_SECURE'] = True @app.before_request @@ -251,12 +251,12 @@ def create_app(): return permission_level_string(term) @app.template_filter('babycode') - def babycode_filter(markup): - return babycode_to_html(markup).result + def babycode_filter(markup, nofrag=False): + return babycode_to_html(markup, fragment=not nofrag).result @app.template_filter('babycode_strict') - def babycode_strict_filter(markup): - return babycode_to_html(markup, STRICT_BANNED_TAGS).result + def babycode_strict_filter(markup, nofrag=False): + return babycode_to_html(markup, banned_tags=STRICT_BANNED_TAGS, fragment=not nofrag).result @app.template_filter('extract_h2') def extract_h2(content): diff --git a/app/lib/babycode.py b/app/lib/babycode.py index 13e4a8f..f8e1a2a 100644 --- a/app/lib/babycode.py +++ b/app/lib/babycode.py @@ -6,7 +6,25 @@ from pygments.lexers import get_lexer_by_name from pygments.util import ClassNotFound as PygmentsClassNotFound import re -class BabycodeParseResult: +BABYCODE_VERSION = 7 + +class BabycodeError(Exception): + pass + +class BabycodeRenderError(BabycodeError): + pass + +class UnknownASTElementError(BabycodeRenderError): + def __init__(self, element_type, element=None): + self.element_type = element_type + self.element = element + + message = f'Unknown AST element: {element_type}' + if element: + message += f' (element: {element})' + super().__init__(message) + +class BabycodeRenderResult: def __init__(self, result, mentions=[]): self.result = result self.mentions = mentions @@ -15,8 +33,177 @@ class BabycodeParseResult: def __str__(self): return self.result + def dumps(self): + return self.result + + +class BabycodeRenderer: + def __init__(self, tag_map, void_tag_map, emote_map, fragment=False): + self.tag_map = tag_map + self.void_tag_map = void_tag_map + self.emote_map = emote_map + self.fragment = fragment + + def make_mention(self, element): + raise NotImplementedError + + def transform_para_whitespace(self, text): + # markdown rules: + # two spaces at end of line ->
+ text = re.sub(r' +\n', '
', text) + # single newlines -> space (collapsed) + text = re.sub(r'\n', ' ', text) + return text + + def wrap_in_paragraphs(self, nodes, context_is_block=True, is_root=False): + result = [] + current_paragraph = [] + is_first_para = is_root and self.fragment + + def flush_paragraph(): + # TIL nonlocal exists + nonlocal result, current_paragraph, is_first_para + if not current_paragraph: + return + + para_content = ''.join(current_paragraph) + if para_content.strip(): # skip empty paragraphs + if is_first_para: + result.append(para_content) + is_first_para = False + else: + result.append(f"

{para_content}

") + current_paragraph.clear() + + for node in nodes: + if isinstance(node, str): + paras = re.split(r'\n\n+', node) + for i, para in enumerate(paras): + if i > 0 and context_is_block: + flush_paragraph() + + if para: + processed = self.transform_para_whitespace(para) + current_paragraph.append(processed) + else: + inline = is_inline(node) + + if inline and context_is_block: + # inline child within a paragraph context + current_paragraph.append(self.fold(node)) + elif not inline and context_is_block: + # block child within a block context + flush_paragraph() + if is_root: + # this is relevant for fragment. + # fragment only applies to the first inline node(s). + # if the first element is a block, reset "fragment mode". + is_first_para = False + result.append(self.fold(node)) + else: + # either inline in inline context, or block in inline context + current_paragraph.append(self.fold(node)) + + if context_is_block: + # flush final para if we're in a block context + flush_paragraph() + elif current_paragraph: + # inline context - just append whatever we collected + result.append(''.join(current_paragraph)) + + return ''.join(result) + + def fold(self, element): + if isinstance(element, str): + return element + + match element['type']: + case 'bbcode': + tag_name = element['name'] + + if is_inline(element): + # inline tag + # since its inline, all children should be processed inline + content = "".join(self.fold(child) for child in element['children']) + return self.tag_map[tag_name](content, element['attr']) + else: + # block tag + if tag_name in {'ul', 'ol', 'code', 'img'}: + # these handle their own internal structure + content = ''.join( + child if isinstance(child, str) else self.fold(child) + for child in element['children'] + ) + return self.tag_map[tag_name](content, element['attr']) + else: + # block elements that can contain paragraphs + content = self.wrap_in_paragraphs(element['children'], context_is_block=True, is_root=False) + return self.tag_map[tag_name](content, element['attr']) + case 'bbcode_void': + return self.void_tag_map[element['name']](element['attr']) + case 'link': + return f"{element['url']}" + case 'emote': + return self.emote_map[element['name']] + case 'rule': + return '
' + case 'mention': + return self.make_mention(element) + case _: + raise UnknownASTElementError( + element_type=element['type'], + element=element + ) + + def render(self, ast): + out = self.wrap_in_paragraphs(ast, context_is_block=True, is_root=True) + return out + + +class HTMLRenderer(BabycodeRenderer): + def __init__(self, fragment=False): + super().__init__(TAGS, VOID_TAGS, EMOJI, fragment) + + self.mentions = [] + + def make_mention(self, e): + from ..models import Users + from flask import url_for, current_app + with current_app.test_request_context('/'): + target_user = Users.find({'username': e['name'].lower()}) + if not target_user: + return f"@{e['name']}" + + mention_data = { + 'mention_text': f"@{e['name']}", + 'mentioned_user_id': int(target_user.id), + "start": e['start'], + "end": e['end'], + } + if mention_data not in self.mentions: + self.mentions.append(mention_data) + + return f"{'@' if not target_user.has_display_name() else ''}{target_user.get_readable_name()}" + + def render(self, ast): + out = super().render(ast) + return BabycodeRenderResult(out, self.mentions) + + +class RSSXMLRenderer(BabycodeRenderer): + def __init__(self, fragment=False): + super().__init__(RSS_TAGS, VOID_TAGS, RSS_EMOJI, fragment) + + def make_mention(self, element): + from ..models import Users + from flask import url_for, current_app + with current_app.test_request_context('/'): + target_user = Users.find({'username': e['name'].lower()}) + if not target_user: + return f"@{e['name']}" + + return f'{target_user.get_readable_name()}' -BABYCODE_VERSION = 5 NAMED_COLORS = [ 'black', 'silver', 'gray', 'white', 'maroon', 'red', @@ -49,111 +236,6 @@ NAMED_COLORS = [ 'violet', 'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen', ] -def is_tag(e, tag=None): - if e is None: - return False - if isinstance(e, str): - return False - if e['type'] != 'bbcode': - return False - - if tag is None: - return True - - return e['name'] == tag - -def is_text(e): - return isinstance(e, str) - -def tag_code(children, attr, surrounding): - is_inline = children.find('\n') == -1 - if is_inline: - return f"{children}" - else: - input_code = children.strip() - button = f"" - unhighlighted = f"
code block{button}{input_code}
" - if not attr: - return unhighlighted - try: - lexer = get_lexer_by_name(attr.strip()) - formatter = HtmlFormatter(nowrap=True) - return f"
{lexer.name}{button}{highlight(input_code.unescape(), lexer, formatter)}
" - except PygmentsClassNotFound: - return unhighlighted - -def tag_list(children): - list_body = re.sub(r" +\n", "
", children.strip()) - list_body = re.sub(r"\n\n+", "\1", list_body) - return " ".join([f"
  • {x}
  • " for x in list_body.split("\1") if x]) - -def tag_color(children, attr, surrounding): - if not attr: - return f"[color]{children}[/color]" - - hex_re = r"^#?([0-9a-f]{6}|[0-9a-f]{3})$" - potential_color = attr.lower().strip() - - if potential_color in NAMED_COLORS: - return f"{children}" - - m = re.match(hex_re, potential_color) - if m: - return f"{children}" - - # return just the way it was if we can't parse it - return f"[color={attr}]{children}[/color]" - -def tag_spoiler(children, attr, surrounding): - spoiler_name = attr if attr else "Spoiler" - content = f"" - container = f"""""" - return container - -def tag_image(children, attr, surrounding): - img = f"\"{children}\"" - if not is_tag(surrounding[0], 'img'): - img = f"
    {img}" - if not is_tag(surrounding[1], 'img'): - img = f"{img}
    " - return img - -TAGS = { - "b": lambda children, attr, _: f"{children}", - "i": lambda children, attr, _: f"{children}", - "s": lambda children, attr, _: f"{children}", - "u": lambda children, attr, _: f"{children}", - - "img": tag_image, - "url": lambda children, attr, _: f"{children}", - "quote": lambda children, attr, _: f"
    {children}
    ", - "code": tag_code, - "ul": lambda children, attr, _: f"", - "ol": lambda children, attr, _: f"
      {tag_list(children)}
    ", - - "big": lambda children, attr, _: f"{children}", - "small": lambda children, attr, _: f"{children}", - "color": tag_color, - - "center": lambda children, attr, _: f"
    {children}
    ", - "right": lambda children, attr, _: f"
    {children}
    ", - - "spoiler": tag_spoiler, -} - -VOID_TAGS = { - 'lb': lambda attr: '[', - 'rb': lambda attr: ']', - '@': lambda attr: '@', -} - -# [img] is considered block for the purposes of collapsing whitespace, -# despite being potentially inline (since the resulting tag is inline, but creates a block container around itself and sibling images). -# [code] has a special case in is_inline(). -INLINE_TAGS = { - 'b', 'i', 's', 'u', 'color', 'big', 'small', 'url' -} - def make_emoji(name, code): return f' {name}' @@ -203,12 +285,173 @@ EMOJI = { 'wink': make_emoji('wink', 'wink'), } +RSS_EMOJI = { + **EMOJI, + + 'angry': '😡', + + '(': '🙁', + + 'D': '😃', + + 'imp': '😈', + + 'angryimp': '👿', + 'impangry': '👿', + + 'lobster': '🦞', + + '|': '😐', + + 'pensive': '😔', + + 'scissors': '✂️', + + ')': '🙂', + + 'smiletear': '🥲', + 'crytear': '🥲', + + ',': '😭', + 'T': '😭', + 'cry': '😭', + 'sob': '😭', + + 'o': '😮', + 'O': '😮', + + 'hmm': '🤔', + 'think': '🤔', + 'thinking': '🤔', + + 'P': '😛', + 'p': '😛', + + 'weary': '😩', + + ';': '😉', + 'wink': '😉', +} + TEXT_ONLY = ["code"] -def break_lines(text): - text = re.sub(r" +\n", "
    ", text) - text = re.sub(r"\n\n+", "

    ", text) - return text +def tag_code(children, attr): + is_inline = children.find('\n') == -1 + if is_inline: + return f"{children}" + else: + input_code = children.strip() + button = f"" + unhighlighted = f"
    code block{button}{input_code}
    " + if not attr: + return unhighlighted + try: + lexer = get_lexer_by_name(attr.strip()) + formatter = HtmlFormatter(nowrap=True) + return f"
    {lexer.name}{button}{highlight(Markup(input_code).unescape(), lexer, formatter)}
    " + except PygmentsClassNotFound: + return unhighlighted + +def tag_list(children): + list_body = re.sub(r" +\n", "
    ", children.strip()) + list_body = re.sub(r"\n\n+", "\1", list_body) + return " ".join([f"
  • {x}
  • " for x in list_body.split("\1") if x]) + +def tag_color(children, attr): + if not attr: + return f"[color]{children}[/color]" + + hex_re = r"^#?([0-9a-f]{6}|[0-9a-f]{3})$" + potential_color = attr.lower().strip() + + if potential_color in NAMED_COLORS: + return f"{children}" + + m = re.match(hex_re, potential_color) + if m: + return f"{children}" + + # return just the way it was if we can't parse it + return f"[color={attr}]{children}[/color]" + +def tag_spoiler(children, attr): + spoiler_name = attr if attr else "Spoiler" + content = f"" + container = f"""""" + return container + +def tag_image(children, attr): + img = f"\"{children}\"" + return f"
    {img}
    " + +TAGS = { + "b": lambda children, attr: f"{children}", + "i": lambda children, attr: f"{children}", + "s": lambda children, attr: f"{children}", + "u": lambda children, attr: f"{children}", + + "img": tag_image, + "url": lambda children, attr: f"{children}", + "quote": lambda children, attr: f"
    {children}
    ", + "code": tag_code, + "ul": lambda children, attr: f"", + "ol": lambda children, attr: f"
      {tag_list(children)}
    ", + + "big": lambda children, attr: f"{children}", + "small": lambda children, attr: f"{children}", + "color": tag_color, + + "center": lambda children, attr: f"
    {children}
    ", + "right": lambda children, attr: f"
    {children}
    ", + + "spoiler": tag_spoiler, +} + +def tag_code_rss(children, attr): + is_inline = children.find('\n') == -1 + if is_inline: + return f'{children}' + else: + return f'
    {children}
    ' + +RSS_TAGS = { + **TAGS, + 'img': lambda children, attr: f'{children}', + 'spoiler': lambda children, attr: f'
    {attr or "Spoiler"}{children}
    ', + 'code': tag_code_rss, + + 'big': lambda children, attr: f'{children}', + 'small': lambda children, attr: f'{children}' +} + +VOID_TAGS = { + 'lb': lambda attr: '[', + 'rb': lambda attr: ']', + '@': lambda attr: '@', +} + +# [img] is considered block for the purposes of collapsing whitespace, +# despite being potentially inline (since the resulting tag is inline, but creates a block container around itself and sibling images). +# [code] has a special case in is_inline(). +INLINE_TAGS = { + 'b', 'i', 's', 'u', 'color', 'big', 'small', 'url', 'lb', 'rb', '@' +} + +def is_tag(e, tag=None): + if e is None: + return False + if isinstance(e, str): + return False + if e['type'] != 'bbcode' and e['type'] != 'bbcode_void': + return False + + if tag is None: + return True + + return e['name'] == tag + +def is_text(e): + return isinstance(e, str) def is_inline(e): if e is None: @@ -219,7 +462,7 @@ def is_inline(e): if is_tag(e): if is_tag(e, 'code'): # special case, since [code] can be inline OR block - return '\n' not in e['children'] + return '\n' not in e['children'][0] return e['name'] in INLINE_TAGS @@ -227,21 +470,22 @@ def is_inline(e): def make_mention(e, mentions): from ..models import Users - from flask import url_for - target_user = Users.find({'username': e['name'].lower()}) - if not target_user: - return f"@{e['name']}" + from flask import url_for, current_app + with current_app.test_request_context('/'): + target_user = Users.find({'username': e['name'].lower()}) + if not target_user: + return f"@{e['name']}" - mention_data = { - 'mention_text': f"@{e['name']}", - 'mentioned_user_id': int(target_user.id), - "start": e['start'], - "end": e['end'], - } - if mention_data not in mentions: - mentions.append(mention_data) + mention_data = { + 'mention_text': f"@{e['name']}", + 'mentioned_user_id': int(target_user.id), + "start": e['start'], + "end": e['end'], + } + if mention_data not in mentions: + mentions.append(mention_data) - return f"{'@' if not target_user.has_display_name() else ''}{target_user.get_readable_name()}" + return f"{'@' if not target_user.has_display_name() else ''}{target_user.get_readable_name()}" def should_collapse(text, surrounding): if not isinstance(text, str): @@ -255,10 +499,30 @@ def should_collapse(text, surrounding): return False + def sanitize(s): return escape(s.strip().replace('\r\n', '\n').replace('\r', '\n')) -def babycode_to_html(s, banned_tags=[]): + +def babycode_ast(s: str, banned_tags=[]): + """ + transforms a string of babycode into an AST. + the AST is a list of strings or dicts. + + a string element is plain unformatted text. + + a dict element is a node that contains at least the key `type`. + + possible types are: + - bbcode + - bbcode_void + - link + - emote + - rule + - mention + + bbcode type elements have a children key that is a list of children of that node. the children are themselves elements (string or dict). + """ allowed_tags = set(TAGS.keys()) if banned_tags is not None: for tag in banned_tags: @@ -281,44 +545,38 @@ def babycode_to_html(s, banned_tags=[]): ) if not should_collapse(e, surrounding): elements.append(e) + return elements - out = "" - mentions = [] - def fold(element, nobr, surrounding): - if isinstance(element, str): - if nobr: - return element - return break_lines(element) - match element['type']: - case "bbcode": - c = "" - for i in range(len(element['children'])): - child = element['children'][i] - _surrounding = ( - element['children'][i - 1] if i-1 >= 0 else None, - element['children'][i + 1] if i+1 < len(element['children']) else None - ) - _nobr = element['name'] == "code" or element['name'] == "ul" or element['name'] == "ol" - c = c + Markup(fold(child, _nobr, _surrounding)) - res = TAGS[element['name']](c, element['attr'], surrounding) - return res - case "bbcode_void": - return VOID_TAGS[element['name']](element['attr']) - case "link": - return f"{element['url']}" - case 'emote': - return EMOJI[element['name']] - case "rule": - return "
    " - case "mention": - return make_mention(element, mentions) +def babycode_to_html(s: str, banned_tags=[], fragment=False): + """ + transforms a string of babycode into html. - for i in range(len(elements)): - e = elements[i] - surrounding = ( - elements[i - 1] if i-1 >= 0 else None, - elements[i + 1] if i+1 < len(elements) else None - ) - out = out + fold(e, False, surrounding) - return BabycodeParseResult(out, mentions) + parameters: + + s (str) - babycode string + + banned_tags (list) - babycode tags to exclude from being parsed. they will remain as plain text in the transformation. + + fragment (bool) - skip adding an html p tag to the first element if it is inline. + """ + ast = babycode_ast(s, banned_tags) + r = HTMLRenderer(fragment=fragment) + return r.render(ast) + + +def babycode_to_rssxml(s: str, banned_tags=[], fragment=False): + """ + transforms a string of babycode into rss-compatible x/html. + + parameters: + + s (str) - babycode string + + banned_tags (list) - babycode tags to exclude from being parsed. they will remain as plain text in the transformation. + + fragment (bool) - skip adding an html p tag to the first element if it is inline. + """ + ast = babycode_ast(s, banned_tags) + r = RSSXMLRenderer(fragment=fragment) + return r.render(ast)