# PyLink/plugins/regex_filter.py

"""
Regex Filter Plugin for PyLink
Comprehensive message filtering with PCRE2 and flood protection
"""
import os
import time
import hashlib
import threading
import pcre2
from collections import defaultdict, deque
from pathlib import Path

from pylinkirc import utils, conf, world
from pylinkirc.classes import *
from pylinkirc.log import log

# Plugin metadata
__version__ = "1.0.0"

# Module-level state shared by every hook handler in this file.
# flood_tracker: per-source history of (timestamp, message hash) tuples,
#   maintained by _check_flood_protection(); each history is capped at 50.
flood_tracker = defaultdict(lambda: deque(maxlen=50))
# similarity_cache: only cleared in die(); no other code visible in this file
#   reads or writes it.
similarity_cache = defaultdict(lambda: deque(maxlen=20))
# file_watchers: not referenced by any code visible in this file.
file_watchers = {}
# last_reload: reload marker compared against the blacklist file's mtime in
#   _check_file_modified(); 0 means "never reloaded".
last_reload = 0
# filter_lock: re-entrant lock held while flood_tracker is read and updated.
filter_lock = threading.RLock()

class Cr:
    """Module-wide holder for the compiled blacklist patterns."""

    # Class-level (shared) list: populated by _compile_regex_filters() and
    # iterated by _run_regex(). It is never instantiated per object.
    pattern = list()

def _get_config():
    """Return the plugin configuration with defaults filled in.

    Reads the ``regex_filter`` section of the loaded PyLink configuration
    and overlays it on the built-in defaults. A fresh dict is returned on
    every call, so the loaded configuration itself is never modified.

    Returns:
        dict: every default key below, overridden by any value the user set.
    """
    defaults = {
        'blacklist_file': './data/regex_blacklist.txt',
        'log_file': './logs/regex_filter.log',
        'flood_window': 30,  # seconds
        'flood_threshold': 3,  # messages
        'similarity_threshold': 0.8,  # 80% similar
        'enabled': True,
        'log_blocked': True,
        'auto_reload': True,
        'debug': False
    }

    # `or {}` also covers a section that is present but has no value (None),
    # which would otherwise fail when the defaults are merged in.
    user_config = conf.conf.get('regex_filter') or {}

    # User-supplied values win over defaults; neither input dict is mutated.
    return {**defaults, **user_config}

def _ensure_directories():
    """Make sure the blacklist/log directories and a starter blacklist exist.

    Creates the parent directory of both configured files and, when the
    blacklist file is absent, writes a commented template into it. Any
    failure is logged instead of being raised.
    """
    settings = _get_config()

    try:
        # Parent directories of both configured files.
        for key in ('blacklist_file', 'log_file'):
            Path(settings[key]).parent.mkdir(parents=True, exist_ok=True)

        blacklist_path = Path(settings['blacklist_file'])
        if blacklist_path.exists():
            return

        # Starter content: comments only, so no pattern is active by default.
        template_lines = (
            "# Regex Blacklist Patterns - One regex per line\n",
            "# Test all patterns thoroughly before deploying\n",
            "# Examples (commented out by default):\n",
            "# \\bhttps?://(?:bit\\.ly|tinyurl\\.com|t\\.co)/\\w+\\b\n",
            "# \\b(?:spam|phishing|malware)\\b\n",
            "\n",
        )
        with open(blacklist_path, 'w') as f:
            f.writelines(template_lines)
        log.info(f"Created default blacklist file: {blacklist_path}")

    except Exception as e:
        log.error(f"Failed to create directories: {e}")

def _compile_regex_filters():
    """Read the blacklist file and (re)build the compiled pattern list.

    Each line of the configured ``blacklist_file`` is one pattern. Lines that
    start with ``#`` in the first column and lines that are empty or contain
    only whitespace are skipped. A pattern that fails to compile is logged
    with its line number and skipped; the remaining patterns are still used.

    If the file does not exist, an error is logged and the currently loaded
    patterns are left untouched.
    """
    config = _get_config()
    blacklist_file = config['blacklist_file']

    if not os.path.isfile(blacklist_file):
        log.error(f"{config['blacklist_file']} not found.")
        return

    compiled = []
    with open(blacklist_file, 'r') as file:
        for lineno, line in enumerate(file, start=1):
            pattern_text = line.rstrip('\n')
            # Only the line ending is removed from a real pattern, so any
            # leading/trailing spaces that are part of the regex survive.
            if pattern_text.startswith('#') or not pattern_text.strip():
                continue
            try:
                compiled.append(pcre2.compile(pattern_text, jit=True))
            except Exception as e:
                log.error(
                    f"Failed compiling pattern on line {lineno} "
                    f"of {blacklist_file}: {e}"
                )

    # Swap the contents in one list operation once everything is compiled,
    # instead of clearing first and refilling: code iterating Cr.pattern in
    # the meantime should not observe an empty or half-built list.
    Cr.pattern[:] = compiled

def _log_filter_event(event_type, source, target, reason, network_name="unknown"):
    """Append one line describing a filtering decision to the filter log.

    Does nothing when ``log_blocked`` is disabled in the configuration.
    Failures while writing are reported through the PyLink logger and never
    raised to the caller.

    Args:
        event_type: label for the event (the handlers pass the IRC command).
        source: identifier of the sender.
        target: channel, user or placeholder the event was aimed at.
        reason: human-readable reason returned by the filter.
        network_name: name of the network the event was seen on.
    """
    config = _get_config()

    if not config.get('log_blocked', True):
        return

    try:
        log_path = Path(config['log_file'])
        log_path.parent.mkdir(parents=True, exist_ok=True)

        timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        # Don't log actual content for security reasons.
        # The entry must end in a real newline so each event is its own line
        # (an escaped "\\n" would write a literal backslash and 'n').
        log_entry = f"[{timestamp}] {network_name} {event_type}: {source} -> {target} | {reason}\n"

        with open(log_path, 'a') as f:
            f.write(log_entry)

    except Exception as e:
        log.error(f"Failed to write filter log: {e}")

def _check_file_modified(file_path):
    """Reload the patterns if the file's mtime differs from the last one seen.

    The modification time observed at the previous reload is kept in the
    module-level ``last_reload``. Comparing mtime against mtime (rather than
    against the wall clock) avoids reloading on every call when the file
    carries a future timestamp, and avoids missing an edit when the file
    system's clock and the local clock disagree.

    Args:
        file_path: path of the blacklist file to check.

    Returns:
        bool: True if a reload was performed, False otherwise (including
        when the file cannot be stat'ed or read).
    """
    global last_reload

    try:
        mtime = os.path.getmtime(file_path)
    except OSError:
        # Deliberately quiet: this runs for every filtered event, and a
        # missing file simply means "keep the patterns already loaded".
        return False

    if mtime == last_reload:
        return False

    # Record the timestamp before compiling so a failing reload is not
    # retried (and re-logged) on every subsequent event.
    last_reload = mtime
    try:
        _compile_regex_filters()
    except OSError as e:
        log.error(f"Failed to reload {file_path}: {e}")
        return False

    return True

def _get_message_hash(content):
    """Return the MD5 hex digest of *content* after normalisation.

    Normalisation lower-cases the text and removes all whitespace, so
    messages differing only in case or spacing produce the same digest.
    """
    lowered = content.lower()
    squeezed = ''.join(lowered.split())
    digest = hashlib.md5(squeezed.encode('utf-8'))
    return digest.hexdigest()

def _calculate_similarity(hash1, hash2):
    """Return the fraction of positions at which two strings agree.

    Args:
        hash1: first string (the flood check passes message digests).
        hash2: second string.

    Returns:
        float: 0.0 when the lengths differ; otherwise matching positions
        divided by the length. Two empty strings count as identical (1.0).

    Note:
        When the inputs are cryptographic digests, as in the flood check,
        the score is 1.0 for equal digests and otherwise not a meaningful
        measure of how alike the original messages were.
    """
    if len(hash1) != len(hash2):
        return 0.0

    # Equal length and empty: nothing to compare, and dividing by the
    # length below would raise ZeroDivisionError.
    if not hash1:
        return 1.0

    matches = sum(c1 == c2 for c1, c2 in zip(hash1, hash2))
    return matches / len(hash1)

def _check_flood_protection(source, content):
    """Record a message for *source* and report whether it looks like a flood.

    The per-source history is first pruned to entries newer than
    ``flood_window`` seconds, then the current message is appended. A flood
    is reported when the history holds at least ``flood_threshold`` entries
    and, among the most recent ``flood_threshold`` entries (the current one
    included), at least that many reach ``similarity_threshold`` against the
    current message.

    Returns:
        tuple: ``(True, reason)`` on a flood, ``(False, "")`` otherwise.
    """
    settings = _get_config()
    now = time.time()
    window = settings.get('flood_window', 30)
    threshold = settings.get('flood_threshold', 3)
    min_similarity = settings.get('similarity_threshold', 0.8)

    with filter_lock:
        # Drop everything at or before the start of the window.
        oldest_allowed = now - window
        history = deque(
            (stamped for stamped in flood_tracker[source] if stamped[0] > oldest_allowed),
            maxlen=50,
        )
        flood_tracker[source] = history

        # The current message always joins the history, blocked or not.
        digest = _get_message_hash(content)
        history.append((now, digest))

        if len(history) >= threshold:
            newest = list(history)[-threshold:]
            similar = sum(
                1
                for _, earlier_digest in newest
                if _calculate_similarity(digest, earlier_digest) >= min_similarity
            )
            if similar >= threshold:
                return True, "Flood/spam pattern detected"

    return False, ""

def _run_regex(content):
    """Test *content* against every compiled blacklist pattern, in order.

    Returns:
        tuple: ``(match_result, reason)`` for the first pattern whose
        ``match()`` result is truthy, otherwise ``(False, '')``. A pattern
        that raises while matching is logged and skipped.
    """
    # NOTE(review): whether match() is anchored at the start of the text or
    # searches anywhere depends on the pcre2 binding in use — confirm.
    for compiled in Cr.pattern:
        try:
            hit = compiled.match(content)
            if hit:
                return hit, "Content blocked by regex filter"
        except Exception as e:
            log.error(f"Filter error: {e}")
    return False, ''

def _should_filter_content(source, target, content, network_name="unknown"):
    """Decide whether a piece of text should be blocked.

    Order of checks: plugin enabled, non-blank content, optional blacklist
    auto-reload, flood protection, then the regex blacklist. ``target`` and
    ``network_name`` are accepted for the callers' convenience but are not
    consulted here.

    Returns:
        tuple: ``(True, reason)`` when the text should be blocked,
        ``(False, "")`` otherwise.
    """
    settings = _get_config()
    allow = (False, "")

    if not settings.get('enabled', True):
        return allow

    # Nothing to inspect in empty or whitespace-only text.
    if not content or content.strip() == "":
        return allow

    # Pick up blacklist edits before checking this message.
    if settings.get('auto_reload', True) and _check_file_modified(settings['blacklist_file']):
        log.info("Blacklist file updated, patterns reloaded")

    # Flood check runs first; it also records the message in the history.
    flooded, why = _check_flood_protection(source, content)
    if flooded:
        return True, why

    matched, why = _run_regex(content)
    if matched:
        return True, why

    return allow

def handle_privmsg(irc, source, command, args):
    """Hook handler for PRIVMSG: blank the text of a message that is filtered.

    Reads ``args['target']`` and ``args['text']``. When the filter objects,
    the event is logged and ``args['text']`` is replaced in place with an
    empty string.
    """
    if len(args) < 2:
        return

    recipient, message = args['target'], args['text']

    blocked, why = _should_filter_content(source, recipient, message, irc.name)
    if not blocked:
        return

    # Record the decision in the filter log and on the PyLink console.
    _log_filter_event("PRIVMSG", source, recipient, why, irc.name)
    log.warning(f"Regex Filter ({irc.name}): Blocked PRIVMSG from {source} to {recipient} - {why}")

    # The payload dict is mutated rather than replaced.
    # NOTE(review): presumably later hooks receive this same dict and
    # therefore see the empty text — verify against PyLink's hook dispatch.
    args['text'] = ""

def handle_notice(irc, source, command, args):
    """Hook handler for NOTICE: blank the text of a notice that is filtered.

    Reads ``args['target']`` and ``args['text']``; on a filter hit the event
    is logged and ``args['text']`` is set to an empty string in place.
    """
    if len(args) < 2:
        return

    recipient, message = args['target'], args['text']

    blocked, why = _should_filter_content(source, recipient, message, irc.name)
    if not blocked:
        return

    _log_filter_event("NOTICE", source, recipient, why, irc.name)
    log.warning(f"Regex Filter ({irc.name}): Blocked NOTICE from {source} to {recipient} - {why}")
    args['text'] = ""

def handle_part(irc, source, command, args):
    """Hook handler for PART: blank a part reason that is filtered.

    Reads ``args['channels']`` and ``args['text']``. The reason is evaluated
    once for the whole event: _should_filter_content() records every call in
    the flood history, so checking the same reason once per channel would
    make a single PART that lists several channels look like repeated
    messages. When blocked, one log entry is written per channel and
    ``args['text']`` is set to an empty string in place.
    """
    if len(args) < 2:
        return

    channel_list = args['channels']
    reason = args['text']
    source_id = source

    # No channels: nothing to check or log.
    if not channel_list:
        return

    should_block, block_reason = _should_filter_content(
        source_id, ','.join(map(str, channel_list)), reason, irc.name
    )
    if not should_block:
        return

    for channel in channel_list:
        _log_filter_event("PART", source_id, channel, block_reason, irc.name)
        log.warning(f"Regex Filter ({irc.name}): Blocked PART from {source_id} in {channel} - {block_reason}")

    args['text'] = ""

def handle_quit(irc, source, command, args):
    """Hook handler for QUIT: blank a quit reason that is filtered.

    Reads ``args['text']``; on a filter hit the event is logged (with the
    placeholder target "QUIT") and ``args['text']`` is emptied in place.
    """
    if len(args) < 1:
        return

    quit_message = args['text']

    blocked, why = _should_filter_content(source, "QUIT", quit_message, irc.name)
    if not blocked:
        return

    _log_filter_event("QUIT", source, "QUIT", why, irc.name)
    log.warning(f"Regex Filter ({irc.name}): Blocked QUIT from {source} - {why}")
    args['text'] = ""

def handle_topic(irc, source, command, args):
    """Hook handler for TOPIC: blank a topic that is filtered.

    Reads ``args['channel']`` and ``args['text']``. On a filter hit the
    event is logged and the topic text is emptied in place. The key that is
    blanked is ``'text'`` — the same key the topic is read from — so the
    checked content is the content that gets cleared.
    """
    if len(args) < 2:
        return

    channel = args['channel']
    topic = args['text']
    source_id = source
    should_block, reason = _should_filter_content(source_id, channel, topic, irc.name)

    if should_block:
        _log_filter_event("TOPIC", source_id, channel, reason, irc.name)
        log.warning(f"Regex Filter ({irc.name}): Blocked TOPIC from {source_id} in {channel} - {reason}")
        args['text'] = ""
        return

def handle_kick(irc, source, command, args):
    """Hook handler for KICK: blank a kick reason that is filtered.

    Reads ``args['channel']``, ``args['target']`` and ``args['text']``. The
    log target is written as ``channel:kicked_user``. On a filter hit the
    event is logged and ``args['text']`` is emptied in place.
    """
    if len(args) < 3:
        return

    chan, victim, kick_message = args['channel'], args['target'], args['text']
    log_target = f"{chan}:{victim}"

    blocked, why = _should_filter_content(source, log_target, kick_message, irc.name)
    if not blocked:
        return

    _log_filter_event("KICK", source, log_target, why, irc.name)
    log.warning(f"Regex Filter ({irc.name}): Blocked KICK from {source} in {chan} - {why}")
    args['text'] = ""

def handle_nick(irc, source, command, args):
    """Hook handler for NICK: replace a new nickname that is filtered.

    Reads ``args['newnick']``. On a filter hit the event is logged and
    ``args['newnick']`` is overwritten in place with ``Filtered<unix time>``.
    """
    if len(args) < 1:
        return

    requested_nick = args['newnick']

    blocked, why = _should_filter_content(source, "NICK", requested_nick, irc.name)
    if not blocked:
        return

    _log_filter_event("NICK", source, "NICK", why, irc.name)
    log.warning(f"Regex Filter ({irc.name}): Blocked NICK change from {source} - {why}")
    # Substitute a neutral nick derived from the current time.
    args['newnick'] = f"Filtered{int(time.time())}"

def handle_join(irc, source, command, args):
    """Hook handler for JOIN: log a join whose channel name is filtered.

    Reads ``args['channel']`` and runs it through the filter. Unlike the
    other handlers, nothing in the payload is modified: a hit is only
    logged, because the channel name cannot simply be blanked.
    """
    if len(args) < 1:
        return

    joined_channel = args['channel']

    blocked, why = _should_filter_content(source, "JOIN", joined_channel, irc.name)
    if not blocked:
        return

    # Monitoring only — the payload is left untouched.
    _log_filter_event("JOIN", source, joined_channel, why, irc.name)
    log.warning(f"Regex Filter ({irc.name}): Blocked JOIN from {source} to {joined_channel} - {why}")

def handle_away(irc, source, command, args):
    """Hook handler for AWAY: blank an away message that is filtered.

    Reads ``args['text']``; on a filter hit the event is logged (with the
    placeholder target "AWAY") and ``args['text']`` is emptied in place.
    """
    if len(args) < 1:
        return

    away_message = args['text']

    blocked, why = _should_filter_content(source, "AWAY", away_message, irc.name)
    if not blocked:
        return

    _log_filter_event("AWAY", source, "AWAY", why, irc.name)
    log.warning(f"Regex Filter ({irc.name}): Blocked AWAY from {source} - {why}")
    args['text'] = ""

def main(irc=None):
    """Initialise the plugin: prepare files, load patterns, register hooks.

    Every handler is registered with priority 1050. Any exception raised
    during set-up is logged rather than propagated.
    """
    try:
        _ensure_directories()
        _compile_regex_filters()

        # (handler, IRC command) pairs, registered in this order.
        hook_table = (
            (handle_privmsg, 'PRIVMSG'),
            (handle_notice, 'NOTICE'),
            (handle_part, 'PART'),
            (handle_quit, 'QUIT'),
            (handle_topic, 'TOPIC'),
            (handle_kick, 'KICK'),
            (handle_nick, 'NICK'),
            (handle_join, 'JOIN'),
            (handle_away, 'AWAY'),
        )
        for handler, irc_command in hook_table:
            utils.add_hook(handler, irc_command, priority=1050)

        log.info("Regex Filter plugin loaded successfully")

    except Exception as e:
        log.error(f"Failed to initialize Regex Filter plugin: {e}")

def die(irc=None):
    """Clean up on unload: empty the flood and similarity tracking tables.

    Both tables are cleared in place (not rebound). Any exception is logged
    rather than propagated.
    """
    try:
        for table in (flood_tracker, similarity_cache):
            table.clear()

        log.info("Regex Filter plugin unloaded")

    except Exception as e:
        log.error(f"Error during Regex Filter plugin cleanup: {e}")