[test] fix html parser(expandable)

pull/138/head
ZetGo | Aleksej K. 2025-05-25 15:32:11 +03:00
parent 46e7ab7cf7
commit 4dde142db4
2 changed files with 152 additions and 0 deletions

146
heroku/tl/utils.py 100644
View File

@ -0,0 +1,146 @@
"""
Simple HTML -> Telegram entity parser.
"""
import struct
from collections import deque
from html.parser import HTMLParser
from typing import Tuple, List
from herokutl import helpers
from herokutl.tl.types import (
MessageEntityBold, MessageEntityItalic, MessageEntityCode,
MessageEntityPre, MessageEntityEmail, MessageEntityUrl,
MessageEntityTextUrl, MessageEntityUnderline,
MessageEntityStrike, MessageEntityBlockquote,
TypeMessageEntity, MessageEntityCustomEmoji, MessageEntitySpoiler
)
# Helpers from markdown.py
def _add_surrogate(text):
return ''.join(
''.join(chr(y) for y in struct.unpack('<HH', x.encode('utf-16le')))
if (0x10000 <= ord(x) <= 0x10FFFF) else x for x in text
)
def _del_surrogate(text):
return text.encode('utf-16', 'surrogatepass').decode('utf-16')
class HTMLToTelegramParser(HTMLParser):
    """
    HTML parser that collects the markup-free text and the Telegram message
    entities described by the tags it encounters.

    After feeding HTML, ``text`` holds the accumulated plain text and
    ``entities`` the entities of every recognised tag that has been closed.
    Offsets and lengths are measured with ``len()`` on the text handed to
    ``handle_data``.
    """

    def __init__(self):
        super().__init__()
        # Plain text accumulated so far (markup removed).
        self.text = ''
        # Finished entities, appended in the order their tags are closed.
        self.entities = []
        # Entities whose tag is still open, keyed by tag name.
        self._building_entities = {}
        # Stack of currently open tag names; the innermost one is at index 0.
        self._open_tags = deque()
        # Parallel stack of per-tag metadata: the replacement text for an
        # ``<a>`` tag, ``None`` for everything else.
        self._open_tags_meta = deque()

    def handle_starttag(self, tag, attrs):
        """
        Record the opened tag and, if it maps to an entity, start building it.

        :param tag: lower-cased tag name.
        :param attrs: list of ``(name, value)`` pairs; ``value`` is ``None``
                      for attributes written without a value.
        :raises KeyError: for an ``<emoji>`` tag without ``document_id``.
        :raises ValueError: for an ``<emoji>`` tag whose ``document_id`` is
                            not an integer.
        """
        self._open_tags.appendleft(tag)
        self._open_tags_meta.appendleft(None)

        attrs = dict(attrs)
        EntityType = None
        args = {}
        if tag in ("strong", "b"):
            EntityType = MessageEntityBold
        elif tag in ("em", "i"):
            EntityType = MessageEntityItalic
        elif tag == "tg-spoiler":
            EntityType = MessageEntitySpoiler
        elif tag == 'u':
            EntityType = MessageEntityUnderline
        elif tag in ("del", "s"):
            EntityType = MessageEntityStrike
        elif tag == 'blockquote':
            EntityType = MessageEntityBlockquote
            # ``expandable`` is a boolean attribute: ``<blockquote expandable>``
            # is reported with a value of None, so test for presence rather
            # than for a truthy value.
            args["collapsed"] = 'expandable' in attrs
        elif tag == 'code':
            # If we're in the middle of a <pre> tag, this <code> tag is
            # probably intended for syntax highlighting.
            #
            # Syntax highlighting is set with
            #     <code class='language-...'>codeblock</code>
            # inside <pre> tags
            pre = self._building_entities.get('pre')
            if pre is None:
                EntityType = MessageEntityCode
            else:
                css_class = attrs.get('class')
                # A missing or valueless ``class`` leaves the language as is.
                if css_class is not None:
                    pre.language = css_class[len('language-'):]
        elif tag == 'pre':
            EntityType = MessageEntityPre
            args["language"] = ''
        elif tag == 'a':
            url = attrs.get('href')
            if url is None:
                # No href (or ``href`` without a value): nothing to link to,
                # the tag content is kept as plain text.
                return
            if url.startswith('mailto:'):
                url = url[len('mailto:'):]
                EntityType = MessageEntityEmail
            else:
                # NOTE(review): get_starttag_text() returns the markup of the
                # whole start tag, so this comparison looks unlikely to ever
                # match; kept unchanged.
                if self.get_starttag_text() == url:
                    EntityType = MessageEntityUrl
                else:
                    EntityType = MessageEntityTextUrl
                    args['url'] = _del_surrogate(url)
                    url = None
            # A non-None value here replaces the tag's text in handle_data.
            self._open_tags_meta[0] = url
        elif tag == "emoji" and CUSTOM_EMOJIS:
            EntityType = MessageEntityCustomEmoji
            args["document_id"] = int(attrs["document_id"])

        # Only one entity per tag name is built at a time; a nested tag of
        # the same name is ignored.
        if EntityType and tag not in self._building_entities:
            self._building_entities[tag] = EntityType(
                offset=len(self.text),
                # The length will be determined when closing the tag.
                length=0,
                **args)

    def handle_data(self, text):
        """
        Append a run of text and extend every entity that is still open.

        :param text: the text found between tags.
        """
        previous_tag = self._open_tags[0] if self._open_tags else ''
        if previous_tag == 'a':
            url = self._open_tags_meta[0]
            if url:
                text = url

        for entity in self._building_entities.values():
            entity.length += len(text)

        self.text += text

    def handle_endtag(self, tag):
        """
        Pop the innermost open tag and finish the entity built for ``tag``.

        :param tag: lower-cased name of the tag being closed.
        """
        # Both stacks are always pushed and popped together; a stray closing
        # tag with nothing open is ignored.
        if self._open_tags:
            self._open_tags.popleft()
            self._open_tags_meta.popleft()
        entity = self._building_entities.pop(tag, None)
        if entity:
            self.entities.append(entity)
def parse(html: str) -> Tuple[str, List[TypeMessageEntity]]:
    """
    Parses the given HTML message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    :param html: the message with HTML to be parsed.
    :return: a tuple consisting of (clean message, [message entities]).
             A falsy ``html`` is returned unchanged with an empty list.
    """
    if not html:
        return html, []

    parser = HTMLToTelegramParser()
    # Surrogate pairs are expanded first so the parser counts offsets and
    # lengths on the expanded text.
    parser.feed(_add_surrogate(html))
    # feed() may hold back trailing text (for example after an unterminated
    # ``&``) while waiting for more input; close() forces it to be processed
    # so the end of the message is not lost.
    parser.close()
    text = helpers.strip_text(parser.text, parser.entities)
    return _del_surrogate(text), parser.entities
# Feature flag checked by HTMLToTelegramParser.handle_starttag: while true,
# <emoji document_id="..."> tags are turned into MessageEntityCustomEmoji.
CUSTOM_EMOJIS = True  # Can be disabled externally

View File

@ -51,6 +51,8 @@ import emoji
import git import git
import grapheme import grapheme
import herokutl import herokutl
import herokutl.extensions
import herokutl.extensions.html
import requests import requests
from aiogram.types import Message as AiogramMessage from aiogram.types import Message as AiogramMessage
from herokutl import hints from herokutl import hints
@ -104,6 +106,10 @@ from ._internal import fw_protect
from .inline.types import BotInlineCall, InlineCall, InlineMessage from .inline.types import BotInlineCall, InlineCall, InlineMessage
from .tl_cache import CustomTelegramClient from .tl_cache import CustomTelegramClient
from .types import HerokuReplyMarkup, ListLike, Module from .types import HerokuReplyMarkup, ListLike, Module
# Temporary bypass of the original parser: herokutl's HTML ``parse`` is
# replaced with the local implementation from ``.tl.utils``.
from .tl.utils import parse
herokutl.extensions.html.parse = parse
# Upstream issue: https://github.com/LonamiWebs/Telethon/issues/4627
FormattingEntity = typing.Union[ FormattingEntity = typing.Union[
MessageEntityUnknown, MessageEntityUnknown,