import os
import re
import argparse
import shutil
from datetime import datetime

"""
clean_js_agent.py
------------------
Scans JavaScript (and optionally HTML/PHP) files for patterns commonly used by
JS:Agent-ENC trojans (obfuscated bootstrapper that fetches/executes payloads),
then optionally removes those injected blocks while creating backups.

⚠️ Disclaimer: Heuristics may remove some legitimate minified/obfuscated code.
Run without --fix first (report-only dry run, the default) and review the report before applying --fix.
"""

# --- Suspicious pattern detectors -------------------------------------------------

# Named heuristics used to flag a file. Each value is a compiled pattern that is
# tried with ``search`` against the whole file text; the key is the indicator
# name. Dict order is the order in which indicators are listed.
SUSPICIOUS_REGEXES = {
    # Gate often used by injectors: if(typeof X==="undefined"){ ... }
    'typeof_undefined_gate': re.compile(
        r";\s*if\s*\(\s*typeof\s+[A-Za-z_$][\w$]*\s*===\s*['\"]undefined['\"]\s*\)\s*\{",
        re.DOTALL
    ),
    # Same gate, restricted to the "nqdq" identifier seen in the analysed sample.
    # Kept separately so the cleaner can prefer this high-precision match.
    'nqdq_gate': re.compile(
        r";\s*if\s*\(\s*typeof\s+nqdq\s*===\s*['\"]undefined['\"]\s*\)\s*\{",
        re.DOTALL
    ),
    # Obfuscation trick: while(!![]) loop
    'while_true_array': re.compile(r"while\s*\(\s*!!\[\]\s*\)", re.DOTALL),
    # Heavy parseInt noise within try/catch loop
    'parseInt_noise': re.compile(r"try\s*\{[^}]*parseInt\([^)]*\)[^}]*\}", re.DOTALL),
    # Custom Base64/URL decoders: either fromCharCode followed (within 400
    # characters) by decodeURIComponent, or any atob(...) call.
    'custom_decoder': re.compile(
        r"fromCharCode[\s\S]{0,400}decodeURIComponent|atob\s*\([^)]*\)\s*;?",
        re.IGNORECASE
    ),
    # Encoded strings presence
    'encoded_hex_unicode': re.compile(r"\\x[0-9A-Fa-f]{2}|\\u[0-9A-Fa-f]{4}"),
    # XMLHttpRequest wrapper followed (within 600 characters) by a token() or
    # rand() beacon call. The alternation is grouped on purpose: without the
    # group, '|' binds loosest and a bare rand() anywhere in the file matched,
    # so the match no longer pointed at an XMLHttpRequest occurrence.
    'xmlhttp_wrapper': re.compile(
        r"XMLHttpRequest[\s\S]{0,600}(?:token|rand)\s*\(\)",
        re.IGNORECASE
    ),
    # Self-modifying/dynamic execution
    'eval_function': re.compile(r"\beval\s*\(|\bnew\s+Function\s*\(", re.IGNORECASE),
    # Sample identifiers often present in this injection
    'sample_idents': re.compile(r"\b(a0E|a0l|HttpClient|token|rand)\b"),
}

# Patterns marking where an injected block may begin, in priority order:
# first the `;if(typeof X==="undefined"){` gate, then a generic
# `(function(...){` IIFE opener.
AGENT_START_PATTERNS = [
    re.compile(pattern_source, re.DOTALL)
    for pattern_source in (
        r";\s*if\s*\(\s*typeof\s+[A-Za-z_$][\w$]*\s*===\s*['\"]undefined['\"]\s*\)\s*\{",
        r"\(function\s*\([^)]*\)\s*\{",
    )
]

# --- Helpers ---------------------------------------------------------------------

def find_suspicious_indicators(text):
    """
    Return the names of all SUSPICIOUS_REGEXES entries whose pattern is found
    somewhere in `text`, in the dictionary's iteration order.
    """
    return [
        indicator
        for indicator, pattern in SUSPICIOUS_REGEXES.items()
        if pattern.search(text)
    ]

# Characters after which a '/' is taken to open a regex literal rather than to
# be a division operator (heuristic; keywords such as `return` are not covered).
_REGEX_PRECEDERS = frozenset("(,=:[!&|?{};+-*%<>~^")

def _skip_js_string(text, i):
    """
    Return the index just past the JS string/template literal opened by the
    quote at text[i], or -1 if no closing quote is found (for ' and " the
    search stops at the end of the line).
    """
    quote = text[i]
    n = len(text)
    j = i + 1
    while j < n:
        ch = text[j]
        if ch == '\\':
            j += 2  # skip the escaped character, whatever it is
            continue
        if ch == quote:
            return j + 1
        if ch == '\n' and quote != '`':
            return -1  # only template literals may span lines
        j += 1
    return -1

def _skip_js_regex(text, i):
    """
    Return the index just past the regex literal body opened by the '/' at
    text[i], or -1 if it is not closed on the same line.
    """
    n = len(text)
    j = i + 1
    in_class = False  # inside [...], where '/' does not end the literal
    while j < n:
        ch = text[j]
        if ch == '\n':
            return -1
        if ch == '\\':
            j += 2
            continue
        if ch == '[':
            in_class = True
        elif ch == ']':
            in_class = False
        elif ch == '/' and not in_class:
            return j + 1
        j += 1
    return -1

def _find_matching_brace(text, open_idx):
    """
    Return the index of the '}' that balances the '{' at text[open_idx], or -1.
    Braces inside string/template literals, comments and (heuristically
    detected) regex literals are not counted.
    """
    n = len(text)
    depth = 0
    prev_sig = ''  # last non-whitespace, non-comment character seen
    i = open_idx
    while i < n:
        ch = text[i]
        if ch in '\'"`':
            end = _skip_js_string(text, i)
            if end != -1:
                prev_sig = ch
                i = end
                continue
            # Unterminated: treat the quote as an ordinary character.
        elif ch == '/':
            nxt = text[i + 1:i + 2]
            if nxt == '/':
                newline = text.find('\n', i)
                if newline == -1:
                    return -1  # comment runs to the end of the text
                i = newline + 1
                continue
            if nxt == '*':
                close = text.find('*/', i + 2)
                if close == -1:
                    return -1
                i = close + 2
                continue
            if prev_sig in _REGEX_PRECEDERS:
                end = _skip_js_regex(text, i)
                if end != -1:
                    prev_sig = '/'
                    i = end
                    continue
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return i
        if not ch.isspace():
            prev_sig = ch
        i += 1
    return -1

def remove_block_by_brace_balance(text, start_idx):
    """
    Remove a brace-delimited block that starts at or after `start_idx`.

    The first '{' at or after `start_idx` is taken as the block opener and its
    balancing '}' is located. Everything from `start_idx` (not from the brace)
    through that closing brace is cut out. Braces that appear inside JS string
    literals, template literals, comments or regex literals are ignored while
    balancing, so a payload such as `var s = "}";` does not end the block
    early. Regex-literal detection is heuristic.

    Returns (new_text, removed): `removed` is False, and `text` is returned
    unchanged, when there is no '{' or no balancing '}'.
    """
    brace_start = text.find('{', start_idx)
    if brace_start == -1:
        return text, False
    brace_end = _find_matching_brace(text, brace_start)
    if brace_end == -1:
        return text, False
    return text[:start_idx] + text[brace_end + 1:], True

# What remains right after the function body of an IIFE has been cut out:
#   `)(args)` / `).call(args)` / `).apply(args)`  from `(function(){...})(args);`
#   `(args))`                                      from `(function(){...}(args));`
# optionally followed by `;`. Arguments may contain one level of nested parens.
_IIFE_CALL_TAIL = re.compile(
    r"\s*(?:"
    r"\)\s*(?:\.\s*(?:call|apply)\s*)?\((?:[^()]|\([^()]*\))*\)"
    r"|\((?:[^()]|\([^()]*\))*\)\s*\)"
    r")(?:\s*;)?"
)

def remove_obfuscated_iife(text):
    """
    Attempt to remove the common obfuscated JS:Agent-ENC injection block.

    Each pass looks for a block start, preferring the exact 'nqdq' gate, then
    the generic `if(typeof X==="undefined"){` gate, then an anonymous
    `(function(...){` IIFE, and removes the balanced block that follows.
    When the removed block was an IIFE, the invocation tail that would
    otherwise be left dangling (e.g. `)();`) is removed too, so the remaining
    file does not end up with a stray `)();` syntax error.

    Repeats up to 5 times in case of multiple injections and stops at the
    first pass that finds or removes nothing.

    Returns (new_text, removed_any).
    """
    removed_any = False
    cur_text = text
    for _ in range(5):
        # Prefer exact 'nqdq' gate removal if present
        start_match = SUSPICIOUS_REGEXES['nqdq_gate'].search(cur_text)
        if start_match is None:
            for rx in AGENT_START_PATTERNS:
                start_match = rx.search(cur_text)
                if start_match:
                    break
        if start_match is None:
            break

        start_idx = start_match.start()
        new_text, removed = remove_block_by_brace_balance(cur_text, start_idx)
        if not removed:
            break

        # Gate matches start with ';'; only the IIFE pattern starts with '('.
        if cur_text.startswith('(', start_idx):
            # new_text[start_idx:] is whatever followed the closing brace.
            tail = _IIFE_CALL_TAIL.match(new_text, start_idx)
            if tail:
                new_text = new_text[:start_idx] + new_text[tail.end():]

        cur_text = new_text
        removed_any = True
    return cur_text, removed_any

# --- Core scanning/cleaning logic -------------------------------------------------

def _backup_original(path, root, backup_dir):
    """Copy `path` into `backup_dir`, mirroring its location relative to `root`."""
    rel = os.path.relpath(path, start=root)
    backup_path = os.path.join(backup_dir, rel)
    os.makedirs(os.path.dirname(backup_path), exist_ok=True)
    shutil.copy2(path, backup_path)

def _strip_injection(content):
    """
    Try the removal strategies in order and return the cleaned text, or None
    when none of them removed anything.
    """
    # 1) Structured removal of a gate/IIFE block.
    new_content, removed = remove_obfuscated_iife(content)
    if removed:
        return new_content

    # 2) Fallback: cut from the last '(function' before a while(!![]) loop to
    #    the first '})();' after it. The '+ 4' leaves the final ';' in place
    #    (a harmless empty statement).
    m = SUSPICIOUS_REGEXES['while_true_array'].search(content)
    if m:
        start_func = content.rfind('(function', 0, m.start())
        end_call = content.find('})();', m.end())
        if start_func != -1 and end_call != -1:
            return content[:start_func] + content[end_call + 4:]

    # 3) Last attempt: remove the balanced block opened by the nearest '{'
    #    preceding the XMLHttpRequest beacon match.
    m2 = SUSPICIOUS_REGEXES['xmlhttp_wrapper'].search(content)
    if m2:
        brace_prev = content.rfind('{', 0, m2.start())
        if brace_prev != -1:
            new_content, removed2 = remove_block_by_brace_balance(content, brace_prev)
            if removed2:
                return new_content
    return None

def process_file(path, root, fix=False, backup_dir=None):
    """
    Scan one file and, when `fix` is true, try to strip the injected block.

    The file is read and written as UTF-8 with `surrogateescape` and without
    newline translation, so bytes that are not valid UTF-8 and the original
    line endings survive a clean/write-back cycle unchanged.

    When a file is about to be modified and `backup_dir` is given, the
    original is first copied to `backup_dir` under its path relative to
    `root`. If that copy fails the file is left untouched and an error is
    reported, rather than overwriting the only copy.

    Returns a dict with 'path' and, unless reading failed, 'indicators' (list
    of indicator names) and 'cleaned' (bool). On failure an 'error' entry
    prefixed with 'read_error', 'backup_error' or 'write_error' is included.
    """
    try:
        with open(path, 'r', encoding='utf-8', errors='surrogateescape', newline='') as f:
            content = f.read()
    except OSError as e:
        return {'path': path, 'error': f'read_error: {e}'}

    indicators = find_suspicious_indicators(content)
    result = {'path': path, 'indicators': indicators, 'cleaned': False}
    if not (fix and indicators):
        return result

    new_content = _strip_injection(content)
    if new_content is None:
        return result

    if backup_dir:
        try:
            _backup_original(path, root, backup_dir)
        except (OSError, ValueError) as e:
            # Never overwrite a file whose original could not be saved.
            return {'path': path, 'error': f'backup_error: {e}', 'indicators': indicators, 'cleaned': False}

    try:
        with open(path, 'w', encoding='utf-8', errors='surrogateescape', newline='') as f:
            f.write(new_content)
    except (OSError, UnicodeError) as e:
        return {'path': path, 'error': f'write_error: {e}', 'indicators': indicators, 'cleaned': False}

    result['cleaned'] = True
    return result

# --- CLI -------------------------------------------------------------------------

def iter_files(root, exts):
    """
    Walk `root` recursively and yield the full path of every file whose
    lower-cased name ends with one of the suffixes in `exts`.
    """
    suffixes = tuple(exts)
    for folder, _subdirs, names in os.walk(root):
        yield from (
            os.path.join(folder, name)
            for name in names
            if name.lower().endswith(suffixes)
        )

def main(args):
    """
    Scan `args.root`, optionally clean flagged files, and print a report.

    `args` must provide `root` (directory to scan), `fix` (bool: modify files)
    and `include_html` (bool: also scan .html/.htm/.php files).

    With `fix`, a timestamped `backup_js_agent_*` directory is created under
    the root. Files inside any top-level `backup_js_agent_*` directory are
    skipped while scanning: they are backup copies from this or an earlier
    run, and scanning them would double-count results and "clean" the backups.

    Exits via SystemExit if the root is not an existing directory (otherwise
    `--fix` would silently create it).
    """
    if not os.path.isdir(args.root):
        raise SystemExit(f"Root directory not found: {args.root}")

    exts = ['.js']
    if args.include_html:
        exts += ['.html', '.htm', '.php']

    # Prepare backup directory if fixing
    backup_prefix = 'backup_js_agent_'
    backup_dir = None
    if args.fix:
        ts = datetime.now().strftime('%Y%m%d_%H%M%S')
        backup_dir = os.path.join(args.root, f"{backup_prefix}{ts}")
        os.makedirs(backup_dir, exist_ok=True)

    total = 0
    infected = 0
    cleaned = 0
    results = []

    for path in iter_files(args.root, exts):
        # Skip anything stored below a top-level backup directory.
        rel_parts = os.path.relpath(path, start=args.root).split(os.sep)
        if len(rel_parts) > 1 and rel_parts[0].startswith(backup_prefix):
            continue

        total += 1
        r = process_file(path, root=args.root, fix=args.fix, backup_dir=backup_dir)
        results.append(r)
        if r.get('indicators', []):
            infected += 1
        if r.get('cleaned'):
            cleaned += 1

    # Reporting
    print(f"\nScan complete. Files scanned: {total}")
    print(f"Files with suspicious indicators: {infected}")
    if args.fix:
        print(f"Files cleaned: {cleaned}")
        print(f"Backups saved to: {backup_dir}")

    # Detailed listing
    for r in results:
        path = r['path']
        inds = r.get('indicators', [])
        status = 'CLEANED' if r.get('cleaned') else ('INFECTED' if inds else 'OK')
        err = r.get('error')
        if err:
            print(f"[ERROR] {path}: {err}")
        elif inds:
            print(f"[{status}] {path} -> {', '.join(inds)}")
        else:
            print(f"[{status}] {path}")

if __name__ == '__main__':
    # Script entry point: declare the CLI, parse argv and run the scan.
    cli = argparse.ArgumentParser(
        description='Scan and optionally clean JS:Agent-ENC style injections in JS/HTML/PHP files.',
    )
    cli.add_argument('root', help='Root directory to scan')
    cli.add_argument(
        '--fix',
        action='store_true',
        help='Apply cleaning to infected files (creates backups)',
    )
    cli.add_argument(
        '--include-html',
        action='store_true',
        help='Also scan HTML/HTM/PHP files for inline JS',
    )
    main(cli.parse_args())