mirror of https://github.com/coddrago/Heroku
1.0.27: New requirement - grapheme. New `utils.smart_split`, authored by t.me/bsolute. Minor improvements
parent fdef2634de
commit eea3d2f3e5
@@ -29,6 +29,7 @@
 import sys
 import getpass
 import os
+import subprocess

 if (
     getpass.getuser() == "root"
@@ -46,6 +47,30 @@ if (
     if input("> ").lower() != "force_insecure":
         sys.exit(1)


+def deps(e):
+    print(
+        "Error: you have not installed all dependencies correctly.\n"
+        f"{str(e)}\n"
+        "Attempting dependencies installation... Just wait."
+    )
+
+    subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "pip",
+            "install",
+            "--upgrade",
+            "-q",
+            "--disable-pip-version-check",
+            "--no-warn-script-location",
+            "-r",
+            "requirements.txt",
+        ]
+    )
+
+
 if sys.version_info < (3, 8, 0):
     print("Error: you must use at least Python version 3.8.0")  # pragma: no cover
 elif __package__ != "hikka":  # In case they did python __main__.py
@@ -53,21 +78,29 @@ elif __package__ != "hikka":  # In case they did python __main__.py
         "Error: you cannot run this as a script; you must execute as a package"
     )  # pragma: no cover
 else:
-    from . import log
-
-    log.init()
+    try:
+        from . import log
+
+        log.init()
+    except ModuleNotFoundError as e:  # pragma: no cover
+        deps(e)
+        try:
+            from . import log
+
+            log.init()
+        except ModuleNotFoundError as e2:
+            print(
+                "Error while installing dependencies. Please, do this manually!\n"
+                f"{str(e2)}\n"
+                "pip3 install -r requirements.txt"
+            )
+
+            sys.exit(1)

     try:
         from . import main
     except ModuleNotFoundError as e:  # pragma: no cover
-        print(
-            "Error: you have not installed all dependencies correctly.\n"
-            f"{str(e)}\n"
-            "Attempting dependencies installation... Just wait."
-        )
-
-        os.popen("pip3 install -r requirements.txt").read()
+        deps(e)
+        try:
+            from . import main
+        except ModuleNotFoundError as e2:
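A note on the design choice above: the new deps() shells out through the running interpreter rather than the old os.popen("pip3 ..."), which guarantees packages are installed into the same environment that is executing the bot instead of whichever pip3 happens to be first on PATH. A minimal sketch of the pattern (the helper name is hypothetical, not from the commit):

import subprocess
import sys

def install_requirements() -> None:  # hypothetical helper name
    # Install into *this* interpreter's environment, quietly.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "-r", "requirements.txt"],
        check=False,  # the caller retries the import and decides how to fail
    )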
@@ -10,6 +10,7 @@ import logging
 from typing import Union
 from types import FunctionType
 from .. import security
+from .._types import Module
 import inspect

 logger = logging.getLogger(__name__)
@@ -168,7 +169,7 @@ class Utils(InlineUnit):
         return next(
             next(
                 lambda: self._db.get(security.__name__, "masks", {}).get(
-                    f"{getattr(cls_, stack_entry.function).__module__}.{getattr(cls_, stack_entry.function).__name__}",
+                    f"{getattr(cls_, stack_entry.function).__module__}.{stack_entry.function}",
                     getattr(
                         getattr(cls_, stack_entry.function),
                         "security",
@@ -176,7 +177,7 @@ class Utils(InlineUnit):
                 ),
             )
             for name, cls_ in stack_entry.frame.f_globals.items()
-            if name.endswith("Mod") and hasattr(cls_, "strings")
+            if name.endswith("Mod") and issubclass(cls_, Module)
         )
         for stack_entry in inspect.stack()
         if hasattr(stack_entry, "function")
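The switch from hasattr(cls_, "strings") to issubclass(cls_, Module), paired with the new .._types import above, tightens module detection from duck typing to a real type check. A hedged illustration of the difference (the classes here are made up, with a stand-in Module):

# Illustration: any object carrying a `strings` attribute passed the old
# duck-typed check; the new check admits only genuine Module subclasses.
class Module: ...  # stand-in for hikka._types.Module

class FooMod(Module):
    strings = {"name": "Foo"}

class FakeMod:  # has strings, but is not a Module subclass
    strings = {"name": "Fake"}

for cls_ in (FooMod, FakeMod):
    old_check = hasattr(cls_, "strings")
    new_check = isinstance(cls_, type) and issubclass(cls_, Module)
    print(cls_.__name__, old_check, new_check)  # Foo: True True; Fake: True False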
hikka/utils.py
@@ -47,7 +47,7 @@ from telethon.tl.types import (
     MessageEntityMentionName,
 )

-import copy
+import grapheme

 from telethon.hints import Entity
@@ -293,8 +293,9 @@ async def answer(
     try:
         list_ = await message.client.loader.inline.list(
             message=message,
-            strings=smart_split(text, 4096),
+            strings=list(smart_split(text, entity, 4096)),
         )

         if not message.client.loader.inline.init_complete or not list_:
             raise
@@ -575,54 +576,140 @@ def rand(size: int, /) -> str:
     )


-def change_attribute(obj, attribute: str, value: str):
-    object_ = copy.deepcopy(obj)
-    setattr(object_, attribute, value)
-    return object_
+def smart_split(text, entities, length=4096, split_on=("\n", " "), min_length=1):
+    """
+    Split the message into smaller messages.
+    A grapheme will never be broken. Entities will be displaced to match the right location. No inputs will be mutated.
+    The end of each message except the last one is stripped of characters from [split_on]
+    :param text: the plain text input
+    :param entities: the entities
+    :param length: the maximum length of a single message
+    :param split_on: characters (or strings) which are preferred for a message break
+    :param min_length: ignore any matches on [split_on] strings before this number of characters into each message
+    :return:
+    """
+
+    # Authored by @bsolute
+    # https://t.me/LonamiWebs/27777
+
+    encoded = text.encode("utf-16le")
+    pending_entities = entities
+    text_offset = 0
+    bytes_offset = 0
+    text_length = len(text)
+    bytes_length = len(encoded)
+    while text_offset < text_length:
+        if bytes_offset + length * 2 >= bytes_length:
+            yield parser.unparse(
+                text[text_offset:],
+                list(sorted(pending_entities, key=lambda x: x.offset)),
+            )
+            break
+        codepoint_count = len(
+            encoded[bytes_offset : bytes_offset + length * 2].decode(
+                "utf-16le",
+                errors="ignore",
+            )
+        )
+        for search in split_on:
+            search_index = text.rfind(
+                search,
+                text_offset + min_length,
+                text_offset + codepoint_count,
+            )
+            if search_index != -1:
+                break
+        else:
+            search_index = text_offset + codepoint_count
+        split_index = grapheme.safe_split_index(text, search_index)
+        assert split_index > text_offset
+        split_offset_utf16 = (
+            len(text[text_offset:split_index].encode("utf-16le"))
+        ) // 2
+        exclude = 0
+        while (
+            split_index + exclude < text_length
+            and text[split_index + exclude] in split_on
+        ):
+            exclude += 1
+        current_entities = []
+        entities = pending_entities.copy()
+        pending_entities = []
+        for entity in entities:
+            if (
+                entity.offset < split_offset_utf16
+                and entity.offset + entity.length > split_offset_utf16 + exclude
+            ):
+                # spans boundary
+                current_entities.append(
+                    _copy_tl(
+                        entity,
+                        length=split_offset_utf16 - entity.offset,
+                    )
+                )
+                pending_entities.append(
+                    _copy_tl(
+                        entity,
+                        offset=0,
+                        length=entity.offset
+                        + entity.length
+                        - split_offset_utf16
+                        - exclude,
+                    )
+                )
+            elif entity.offset < split_offset_utf16 < entity.offset + entity.length:
+                # overlaps boundary
+                current_entities.append(
+                    _copy_tl(
+                        entity,
+                        length=split_offset_utf16 - entity.offset,
+                    )
+                )
+            elif entity.offset < split_offset_utf16:
+                # wholly left
+                current_entities.append(entity)
+            elif (
+                entity.offset + entity.length
+                > split_offset_utf16 + exclude
+                > entity.offset
+            ):
+                # overlaps right boundary
+                pending_entities.append(
+                    _copy_tl(
+                        entity,
+                        offset=0,
+                        length=entity.offset
+                        + entity.length
+                        - split_offset_utf16
+                        - exclude,
+                    )
+                )
+            elif entity.offset + entity.length > split_offset_utf16 + exclude:
+                # wholly right
+                pending_entities.append(
+                    _copy_tl(
+                        entity,
+                        offset=entity.offset - split_offset_utf16 - exclude,
+                    )
+                )
+            else:
+                assert entity.length <= exclude
+                # ignore entity in whitespace
+        current_text = text[text_offset:split_index]
+        yield parser.unparse(
+            current_text,
+            list(sorted(current_entities, key=lambda x: x.offset)),
+        )
+        text_offset = split_index + exclude
+        bytes_offset += len(current_text.encode("utf-16le"))
+        assert bytes_offset % 2 == 0


-def smart_split(text: str, chunk_size: int) -> List[str]:
-    text = emoji_pattern.sub(r"", text)
-    text, entities = parser.parse(text)
-    result = []
-
-    chunk_begin_offset = 0
-
-    for chunk in chunks(text, chunk_size):
-        chunk_end_offset = chunk_begin_offset + chunk_size
-        # Find all entities which are located in this chunk in particular
-        this_chunk_entities = [
-            copy.deepcopy(entity)
-            for entity in entities
-            if entity.offset + entity.length > chunk_begin_offset
-            and entity.offset < chunk_end_offset
-        ]
-
-        for entity in this_chunk_entities:
-            # If entity starts *before* the chunk
-            if entity.offset < chunk_begin_offset:
-                if entity.offset + entity.length in range(
-                    chunk_begin_offset,
-                    chunk_end_offset + 1,
-                ):
-                    # Entity ends *inside* of the chunk
-                    entity.length = entity.offset + entity.length - chunk_begin_offset
-                else:
-                    # Entity ends *outside* of the chunk
-                    entity.length = chunk_size
-                entity.offset = 0
-            # If entity starts *inside* of chunk
-            elif entity.offset in range(chunk_begin_offset, chunk_end_offset + 1):
-                entity.offset -= chunk_begin_offset
-                if entity.length > chunk_size - entity.offset:
-                    entity.length = chunk_size - entity.offset
-
-        this_chunk_entities.sort(key=lambda x: x.offset)
-
-        result += [[chunk, this_chunk_entities]]
-        chunk_begin_offset += chunk_size
-
-    return [parser.unparse(*i) for i in result]
+def _copy_tl(o, **kwargs):
+    d = o.to_dict()
+    del d["_"]
+    d.update(kwargs)
+    return o.__class__(**d)


 init_ts = time.perf_counter()
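For orientation: the new smart_split is a generator that yields ready-to-send chunks, re-serializing each one through the module-level parser with entity offsets shifted to chunk-local coordinates, and it replaces the old fixed-size chunker, which could cut through surrogate pairs and emoji clusters. A minimal usage sketch, under the assumptions that Telethon's HTML parser module plays the role of that parser and that the package is importable as hikka:

# Usage sketch (assumptions: telethon's HTML parser stands in for the
# module-level `parser`; `hikka.utils` is importable from this checkout).
from telethon.extensions import html as parser

from hikka.utils import smart_split

raw_text, entities = parser.parse("<b>" + "long message " * 400 + "</b>")

# Each yielded item is an already-unparsed string, split on newlines or
# spaces where possible and never inside a grapheme cluster.
for chunk in smart_split(raw_text, entities, 4096):
    print(len(chunk))

The answer() change above wraps the call in list(...), presumably because the inline list expects a concrete sequence rather than a generator.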
@@ -1 +1 @@
-__version__ = (1, 0, 26)
+__version__ = (1, 0, 27)
@@ -9,5 +9,6 @@ Jinja2==3.0.3
 requests==2.27.1
 aiogram==2.19
 websockets==10.2
+grapheme==0.6.0

 # Python 3.8+
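The pinned grapheme dependency backs the safe_split_index call in the new smart_split. A small demo of why it matters: a naive slice can cut through a multi-codepoint grapheme cluster, while the library backs off to the nearest safe boundary.

# Demo: naive slicing vs grapheme-aware slicing.
import grapheme

text = "hi 👨‍👩‍👧‍👦!"  # family emoji: several codepoints joined by ZWJ
print(repr(text[:5]))  # ends mid-cluster: 'hi 👨\u200d'
print(repr(text[: grapheme.safe_split_index(text, 5)]))  # 'hi ' - backs off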