From 4dde142db41b1c7c0d83a4a1eef0396995749334 Mon Sep 17 00:00:00 2001 From: "ZetGo | Aleksej K." <73279716+ZetGoHack@users.noreply.github.com> Date: Sun, 25 May 2025 15:32:11 +0300 Subject: [PATCH] [test] fix html parser(expandable) --- heroku/tl/utils.py | 146 +++++++++++++++++++++++++++++++++++++++++++++ heroku/utils.py | 6 ++ 2 files changed, 152 insertions(+) create mode 100644 heroku/tl/utils.py diff --git a/heroku/tl/utils.py b/heroku/tl/utils.py new file mode 100644 index 0000000..4d784c4 --- /dev/null +++ b/heroku/tl/utils.py @@ -0,0 +1,146 @@ +""" +Simple HTML -> Telegram entity parser. +""" +import struct +from collections import deque +from html.parser import HTMLParser +from typing import Tuple, List + +from herokutl import helpers +from herokutl.tl.types import ( + MessageEntityBold, MessageEntityItalic, MessageEntityCode, + MessageEntityPre, MessageEntityEmail, MessageEntityUrl, + MessageEntityTextUrl, MessageEntityUnderline, + MessageEntityStrike, MessageEntityBlockquote, + TypeMessageEntity, MessageEntityCustomEmoji, MessageEntitySpoiler +) + + +# Helpers from markdown.py +def _add_surrogate(text): + return ''.join( + ''.join(chr(y) for y in struct.unpack(' tag, this tag is + # probably intended for syntax highlighting. + # + # Syntax highlighting is set with + # codeblock + # inside
 tags
+                pre = self._building_entities['pre']
+                try:
+                    pre.language = attrs['class'][len('language-'):]
+                except KeyError:
+                    pass
+            except KeyError:
+                EntityType = MessageEntityCode
+        elif tag == 'pre':
+            EntityType = MessageEntityPre
+            args["language"] = ''
+        elif tag == 'a':
+            try:
+                url = attrs['href']
+            except KeyError:
+                return
+            if url.startswith('mailto:'):
+                url = url[len('mailto:'):]
+                EntityType = MessageEntityEmail
+            else:
+                if self.get_starttag_text() == url:
+                    EntityType = MessageEntityUrl
+                else:
+                    EntityType = MessageEntityTextUrl
+                    args['url'] = _del_surrogate(url)
+                    url = None
+            self._open_tags_meta.popleft()
+            self._open_tags_meta.appendleft(url)
+        elif tag == "emoji" and CUSTOM_EMOJIS:
+            EntityType = MessageEntityCustomEmoji
+            args["document_id"] = int(attrs["document_id"])
+
+        if EntityType and tag not in self._building_entities:
+            self._building_entities[tag] = EntityType(
+                offset=len(self.text),
+                # The length will be determined when closing the tag.
+                length=0,
+                **args)
+
+    def handle_data(self, text):
+        """
+        Appends a run of character data to the accumulated message text.
+
+        If the tag at the front of ``_open_tags`` is ``a`` and the front of
+        ``_open_tags_meta`` holds a non-empty value, that value is used in
+        place of the parsed text. Every entity currently stored in
+        ``_building_entities`` grows by the length of what gets appended.
+
+        :param text: the character data found between tags.
+        """
+        front_tag = self._open_tags[0] if self._open_tags else ''
+        if front_tag == 'a':
+            replacement = self._open_tags_meta[0]
+            if replacement:
+                text = replacement
+
+        # All entities still being built span this piece of text as well.
+        added = len(text)
+        for entity in self._building_entities.values():
+            entity.length += added
+
+        self.text += text
+
+    def handle_endtag(self, tag):
+        """
+        Finishes the bookkeeping for a closing tag.
+
+        The front entries of ``_open_tags`` and ``_open_tags_meta`` are
+        discarded when present, and the entity that was being built under
+        ``tag`` (if any) is moved into ``entities``.
+
+        :param tag: the name of the tag that is being closed.
+        """
+        # An empty stack is tolerated; the metadata stack is only touched
+        # when the tag stack had something to pop.
+        if self._open_tags:
+            self._open_tags.popleft()
+            if self._open_tags_meta:
+                self._open_tags_meta.popleft()
+
+        finished = self._building_entities.pop(tag, None)
+        if not finished:
+            return
+        self.entities.append(finished)
+
+
+def parse(html: str) -> Tuple[str, List[TypeMessageEntity]]:
+    """
+    Parses the given HTML message and returns its stripped representation
+    plus a list of the MessageEntity's that were found.
+
+    :param html: the message with HTML to be parsed. A falsy value (such as
+        ``None`` or an empty string) is returned unchanged with no entities.
+    :return: a tuple consisting of (clean message, [message entities]).
+    """
+    if not html:
+        return html, []
+
+    parser = HTMLToTelegramParser()
+    parser.feed(_add_surrogate(html))
+    # ``feed`` only processes complete elements and may keep trailing data
+    # buffered while it waits for more input (for instance text ending in an
+    # unterminated ``&...`` reference). ``close`` marks the end of the input
+    # so that data reaches the handlers instead of being silently dropped.
+    parser.close()
+    text = helpers.strip_text(parser.text, parser.entities)
+    return _del_surrogate(text), parser.entities
+
+
+CUSTOM_EMOJIS = True  # Gates parsing of "emoji" tags into MessageEntityCustomEmoji; can be set to False externally
\ No newline at end of file
diff --git a/heroku/utils.py b/heroku/utils.py
index 318f581..f9acf3b 100644
--- a/heroku/utils.py
+++ b/heroku/utils.py
@@ -51,6 +51,8 @@ import emoji
 import git
 import grapheme
 import herokutl
+import herokutl.extensions
+import herokutl.extensions.html
 import requests
 from aiogram.types import Message as AiogramMessage
 from herokutl import hints
@@ -104,6 +106,10 @@ from ._internal import fw_protect
 from .inline.types import BotInlineCall, InlineCall, InlineMessage
 from .tl_cache import CustomTelegramClient
 from .types import HerokuReplyMarkup, ListLike, Module
+# Temporary override: replace herokutl's HTML parse function with the local one from .tl.utils
+from .tl.utils import parse
+herokutl.extensions.html.parse = parse
+# Tracking issue for this override: https://github.com/LonamiWebs/Telethon/issues/4627
 
 FormattingEntity = typing.Union[
     MessageEntityUnknown,