#!/usr/bin/env python3
# Runs under both Python 2 and Python 3: preserve this property!
# SPDX-License-Identifier: GPL-2.0+
"""
cvsstrip - skeletonize CVS master files

Called as a filter, skeletonizes a CVS master presented on standard input
and writes it to standard output. If an argument is specified, it must be
the name of a directory containing CVS master files; in that case a
corresponding directory of stripped files is created.

Options:
   -c      Suppress replacement of revision content.
   -o dir  Set name of output directory. Defaults to the input dirname
           with the suffix '-reduced'.
   -i      Interactive mode. Experimental, read the embedded documentation. 
   -l      Suppress replacement of log content with a hash.
   -t      Suppress stripping of (non-sticky) tags.  Sticky tags are
           always preserved.
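   -v      Enable progress messages.
   -w      Winnow. Prune the argument directory in place to a smaller set
           of files that still makes cvs-fast-export fail or hang.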

Default behavior is to strip non-sticky tags, replace each version
of content with a unique string including the revision ID, and
replace log text with its MD5 hash in hex.

The only identifying information left in the tree is filenames and CVS
user IDs.

The intent is to discard bulky content but preserve all metadata
relevant to changeset collation. A collection of stripped files should
imply the same changeset DAG as the unstripped originals, but be
easier to pass around, faster to process, and not reveal potentially
sensitive data.
"""

# pylint: disable=line-too-long,invalid-name,missing-function-docstring,no-else-continue,too-many-branches,consider-using-f-string,consider-using-with,raise-missing-from,too-many-instance-attributes,too-many-arguments,too-many-nested-blocks

# pylint: disable=multiple-imports
import os, sys, getopt, hashlib, io, shutil, cmd, subprocess, time

# Global switches read by skeletonize(); the main program changes them
# according to the command-line options.

# When true, tags whose revision contains no "0" are stripped (cleared by -t).
strip_tags = True
# When true, log text is replaced with its MD5 hash (cleared by -l).
strip_logs = True
# When true, revision content is replaced with a placeholder (cleared by -c).
strip_content = True
# Verbosity level; each -v on the command line raises it by one.
verbose = 0

# Any encoding that preserves every byte value 0x00..0xff through
# round-tripping from byte streams to Unicode and back would do; latin-1
# is the best known of these.

binary_encoding = "latin-1"

# Polymorphic string helpers: the same call sites work whether str is a
# byte string (Python 2) or Unicode (Python 3).
if str is bytes:  # Python 2

    # str and bytes are the same type here, so the helpers are plain
    # aliases and the standard streams are left untouched.
    polystr = str
    polybytes = bytes
    polyord = ord
    polychr = str

else:  # Python 3

    def polystr(o):
        "Polymorphic string factory: str passes through, bytes are decoded with binary_encoding; anything else raises ValueError."
        if isinstance(o, str):
            return o
        if isinstance(o, bytes):
            return str(o, encoding=binary_encoding)
        raise ValueError

    def polybytes(o):
        "Polymorphic bytes factory: bytes pass through, str is encoded with binary_encoding; anything else raises ValueError."
        if isinstance(o, bytes):
            return o
        if isinstance(o, str):
            return bytes(o, encoding=binary_encoding)
        raise ValueError

    def polyord(c):
        "Polymorphic ord() function"
        # A str argument is converted with ord(); any other argument is
        # returned unchanged.
        if isinstance(c, str):
            return ord(c)
        return c

    def polychr(c):
        "Polymorphic chr() function"
        # An int argument is converted with chr(); any other argument is
        # returned unchanged.
        if isinstance(c, int):
            return chr(c)
        return c

    def make_std_wrapper(stream):
        "Standard input/output wrapper factory function"
        # This ensures that the encoding of the standard streams on
        # Python 3 matches the binary encoding we use to turn bytes to
        # Unicode in polystr above.  The stream passed in must expose its
        # underlying binary stream as a .buffer attribute.

        # newline="\n" ensures that Python 3 won't mangle line breaks
        # line_buffering=True ensures that interactive command sessions work as expected
        return io.TextIOWrapper(
            stream.buffer, encoding=binary_encoding, newline="\n", line_buffering=True
        )

    # Import-time side effect: all three standard streams are replaced
    # with wrappers that use binary_encoding.
    sys.stdin = make_std_wrapper(sys.stdin)
    sys.stdout = make_std_wrapper(sys.stdout)
    sys.stderr = make_std_wrapper(sys.stderr)


def replace_escaped_text(inputf, replacement, outputf):
    """Replace text between @ delimiters with a specified string.

    inputf must be positioned at the opening "@" of a delimited string.
    That string is consumed and discarded; inside it a doubled "@" stands
    for a literal "@" and does not end the string.  What is written to
    outputf is "@", the replacement with each "@" doubled, then "@" and a
    newline.

    Exits the program with status 1 if the opening "@" is missing, if the
    input ends before the closing "@", or if the closing "@" is not
    immediately followed by a newline.
    """
    leader = polystr(inputf.read(1))
    if leader != "@":
        sys.stderr.write("cvsstrip: fatal error, @ leader not where expected.\n")
        sys.exit(1)
    outputf.write(polybytes("@" + replacement.replace("@", r"@@")))
    while True:
        nxt = polystr(inputf.read(1))
        if not nxt:
            # read() keeps returning an empty string at end of file, so
            # without this check a truncated master would loop forever.
            sys.stderr.write(
                "cvsstrip: fatal error, end of file inside @-delimited text.\n"
            )
            sys.exit(1)
        if nxt == "@":
            nxt2 = polystr(inputf.read(1))
            if nxt2 != "@":
                # A lone "@" closes the string; "@@" was an escaped "@".
                break
    if nxt2 != "\n":
        sys.stderr.write(
            "cvsstrip: fatal error, @ trailer not followed by newline (%s).\n" % nxt2
        )
        sys.exit(1)
    outputf.write(polybytes("@\n"))


def hash_escaped_text(inputf, outputf):
    """Replace text between @ delimiters with its MD5 hash.

    inputf must be positioned at the opening "@" of a delimited string.
    The string is read with each doubled "@" collapsed to a single one,
    and its MD5 digest (hex) is written to outputf as a one-line
    @-delimited string.

    Exits the program with status 1 if the opening "@" is missing, if the
    input ends before the closing "@", or if the closing "@" is not
    immediately followed by a newline.
    """
    leader = polystr(inputf.read(1))
    if leader != "@":
        sys.stderr.write("cvsstrip: fatal error, @ leader not where expected.\n")
        sys.exit(1)
    # Collect pieces in a list and join once; repeated string
    # concatenation is quadratic on long log messages.
    pieces = []
    while True:
        nxt = polystr(inputf.read(1))
        if not nxt:
            # read() keeps returning an empty string at end of file, so
            # without this check a truncated master would loop forever.
            sys.stderr.write(
                "cvsstrip: fatal error, end of file inside @-delimited text.\n"
            )
            sys.exit(1)
        if nxt == "@":
            nxt2 = polystr(inputf.read(1))
            if nxt2 == "@":
                pieces.append("@")
                continue
            break
        pieces.append(nxt)
    if nxt2 != "\n":
        sys.stderr.write(
            "cvsstrip: fatal error, @ trailer not followed by newline (%s).\n" % nxt2
        )
        sys.exit(1)
    m = hashlib.md5()
    m.update(polybytes("".join(pieces)))
    outputf.write(polybytes("@%s\n@\n" % m.hexdigest()))


def skeletonize(inputf, outputf):
    """Skeletonize a CVS master, discarding content but leaving metadata.

    inputf is read line by line and outputf receives byte strings;
    inputf.name is embedded in the replacement content.  Lines are copied
    through unchanged except that:

    * inside the "symbols" list, entries whose revision contains no "0"
      are dropped when strip_tags is set;
    * the @-delimited text following a "log" line is replaced by its MD5
      hash when strip_logs is set;
    * the @-delimited text following a "text" line is replaced by a
      one-line placeholder naming the file and revision when
      strip_content is set.
    """
    state = "ini"
    last_version = None
    deltacount = 0
    while True:
        line = polystr(inputf.readline())
        if not line:
            break
        if verbose > 1:
            # Must be a text format: a bytes format with str arguments
            # raises TypeError on Python 3, and stderr is a text stream.
            sys.stderr.write("%s: %s\n" % (state, line.strip()))
        if state == "ini":
            if line.startswith("symbols"):
                state = "sym"
            elif line[0].isdigit():
                # Remember the revision ID for the placeholder content.
                last_version = line.strip()
            elif line.startswith("log"):
                if strip_logs:
                    outputf.write(polybytes(line))
                    hash_escaped_text(inputf, outputf)
                    continue
            elif line.startswith("text"):
                if strip_content:
                    outputf.write(polybytes(line))
                    txt = "%s content for %s\n" % (inputf.name, last_version)
                    if deltacount > 0:
                        # Every text after the first gets these two edit
                        # commands in front of the placeholder line.
                        txt = "d1 1\na1 1\n" + txt
                    deltacount += 1
                    replace_escaped_text(inputf, txt, outputf)
                    continue
        elif state == "sym":
            if line[0] not in (" ", "\t") or line.strip() == ";":
                # An unindented line or a bare ";" ends the symbols list.
                state = "ini"
            elif strip_tags and ":" in line and "0" not in line.split(":")[1]:
                # Dropped tag.  If it carried the list terminator, emit a
                # bare terminator so the symbols list stays well-formed.
                # Lines without a ":" are passed through untouched rather
                # than crashing the split above.
                if line.endswith(";\n"):
                    outputf.write(polybytes("\t;\n"))
                continue
        outputf.write(polybytes(line))


class Fatal(Exception):
    "Unrecoverable error."

    def __init__(self, msg):
        # Pass the message to Exception as well, so that str(exc) and
        # uncaught-exception tracebacks show it instead of an empty string.
        Exception.__init__(self, msg)
        self.msg = msg


# Verbosity level at which subprocess command execution is reported.
DEBUG_COMMANDS = 1


def debug_enable(level):
    "Tell whether the global verbosity has reached the given level."
    enabled = level <= verbose
    return enabled


def announce(lvl, msg):
    "Write msg to standard output, tagged with the program name, if verbosity is at least lvl."
    if debug_enable(lvl):
        # Tag with this program's name, as every other diagnostic here does.
        sys.stdout.write("cvsstrip: %s\n" % msg)


def rfc3339(t):
    "Format a Unix timestamp as an RFC3339 UTC string."
    utc_fields = time.gmtime(t)
    stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", utc_fields)
    return stamp


# Clock used for interval timing: time.monotonic where the interpreter
# provides it, otherwise (Python 2) plain time.time.
_monotonic = getattr(time, "monotonic", time.time)


def do_or_die(dcmd, legend=""):
    "Run a shell command, raising Fatal unless it exits with status zero."
    suffix = (" " + legend) if legend else ""
    announce(DEBUG_COMMANDS, "executing '%s'%s" % (dcmd, suffix))
    try:
        status = subprocess.call(dcmd, shell=True, stderr=sys.stderr)
    except (OSError, IOError) as e:
        raise Fatal("execution of %s%s failed: %s" % (dcmd, suffix, e))
    # A negative status means the child was killed by that signal number.
    if status < 0:
        raise Fatal("child was terminated by signal %d." % -status)
    if status != 0:
        raise Fatal("child returned %d." % status)


class popen_or_die:
    """Read or write from a subordinate process.

    Meant to be used as a context manager: entering starts the command
    and yields the pipe (the child's stdout in mode "r", its stdin in
    mode "w"); leaving shuts the pipe down, waits for the child and
    raises Fatal if it exited with nonzero status.  open() and close()
    may also be called directly.

    command       -- shell command line; it is run with shell=True.
    legend        -- optional text appended to diagnostics.
    mode          -- "r" to read the child's output, "w" to feed its input.
    errhandler    -- optional callable given the nonzero exit status; a
                     true return value suppresses the Fatal from close().
    stderrcapture -- in mode "r", merge the child's stderr into the
                     stream being read.
    """

    def __init__(
        self, command, legend="", mode="r", errhandler=None, stderrcapture=False
    ):
        assert mode in ("r", "w")
        self.command = command
        self.legend = legend
        self.mode = mode
        self.errhandler = errhandler
        self.stdin = subprocess.PIPE if mode == "w" else None
        self.stdout = subprocess.PIPE if mode == "r" else None
        self.stderr = subprocess.STDOUT if (stderrcapture and mode == "r") else None
        if self.legend:
            self.legend = " " + self.legend
        # The running subprocess.Popen object, or None when not started.
        self.fp = None

    def open(self):
        "Start the command and return the pipe to read from or write to."
        if debug_enable(DEBUG_COMMANDS):
            if self.mode == "r":
                sys.stderr.write(
                    "%s: reading from '%s'%s\n"
                    % (rfc3339(time.time()), self.command, self.legend)
                )
            else:
                sys.stderr.write(
                    "%s: writing to '%s'%s\n"
                    % (rfc3339(time.time()), self.command, self.legend)
                )
        try:
            # NOTE: the I/O streams for the subprocess are always
            # bytes; callers that need text must decode (see polystr).
            self.fp = subprocess.Popen(
                self.command,
                shell=True,
                stdin=self.stdin,
                stdout=self.stdout,
                stderr=self.stderr,
            )
        except (OSError, IOError) as oe:
            raise Fatal(
                "execution of %s%s failed: %s" % (self.command, self.legend, oe)
            )
        # The Python documentation recommends using communicate() to
        # avoid deadlocks, but this doesn't allow fine control over
        # reading the data; since we are not trying to both read from
        # and write to the same process, this should be OK.
        return self.fp.stdout if self.mode == "r" else self.fp.stdin

    def close(self):
        "Shut the pipes, wait for the child, and raise Fatal on nonzero exit."
        if self.fp is None:
            # Never opened, or already closed: nothing to do.
            return
        proc, self.fp = self.fp, None
        if proc.stdin is not None:
            proc.stdin.close()
        if proc.stdout is not None and not proc.stdout.closed:
            # This avoids a deadlock in wait() below if the OS pipe
            # buffer was filled because we didn't read all of the data
            # before exiting the context mgr (shouldn't happen but
            # this makes sure)
            proc.stdout.read()
            proc.stdout.close()
        proc.wait()
        if proc.returncode != 0:
            if self.errhandler is None or not self.errhandler(proc.returncode):
                raise Fatal("%s%s returned error." % (self.command, self.legend))

    def _abandon(self):
        "Best-effort teardown of the child after the with-body has failed."
        proc, self.fp = self.fp, None
        if proc is None:
            return
        for pipe in (proc.stdin, proc.stdout):
            if pipe is not None:
                try:
                    pipe.close()
                except (OSError, IOError):
                    # An error is already being reported; cleanup
                    # failures must not mask it.
                    pass
        try:
            if proc.poll() is None:
                proc.terminate()
            proc.wait()
        except OSError:
            # The child went away by itself between poll() and terminate().
            pass

    def __enter__(self):
        return self.open()

    def __exit__(self, extype, value, traceback_unused):
        if extype:
            # Don't leak the pipes or leave the child unreaped just
            # because the body raised.
            self._abandon()
            if verbose:
                # Let the original exception propagate with its traceback.
                return False
            raise Fatal("fatal exception in popen_or_die.")
        self.close()
        return False


def subset_test(paths, timeout):
    """Run cvs-fast-export on a path list with an optional timeout.

    paths   -- sized collection of filenames, written one per line to the
               standard input of cvs-fast-export.
    timeout -- seconds allowed, counted from the start of this call;
               0 means wait indefinitely.

    Returns a tuple (returncode, elapsed_seconds, timed_out).

    Note that the timeout is only checked once all paths have been
    written to the child.
    """
    start = _monotonic()
    timed_out = False
    with open(os.devnull, "wb") as devnull:
        print("Testing with %s elements" % len(paths))
        proc = subprocess.Popen(
            "cvs-fast-export",
            shell=True,
            stdin=subprocess.PIPE,
            stdout=devnull,
            stderr=devnull,
        )
        try:
            for path in paths:
                proc.stdin.write(polybytes(path))
                proc.stdin.write(b"\n")
            proc.stdin.close()
        except (OSError, IOError):
            # The child exited before reading all of its input.  That is
            # exactly the kind of failure being hunted, so record it via
            # the exit status below instead of aborting with a traceback.
            try:
                proc.stdin.close()
            except (OSError, IOError):
                pass

        if timeout == 0:
            proc.wait()
        else:
            while proc.poll() is None:
                if _monotonic() - start >= timeout:
                    timed_out = True
                    # Ask politely first, then force the issue after a
                    # one-second grace period.
                    proc.terminate()
                    deadline = _monotonic() + 1.0
                    while _monotonic() < deadline:
                        if proc.poll() is not None:
                            break
                        time.sleep(0.05)
                    if proc.poll() is None:
                        proc.kill()
                    break
                time.sleep(0.05)
        if proc.returncode is None:
            proc.wait()
        elapsed = _monotonic() - start
        if timed_out:
            print("Hung.")
        elif proc.returncode == 0:
            print("Succeeded.")
        else:
            print("Failed (rc=%s) after %d seconds" % (proc.returncode, elapsed))
        return (proc.returncode, elapsed, timed_out)


def reaper(dirs):
    """
    Given a directory (or a list or tuple of directories), prune it in
    place to a smaller subset of paths that still produces a hang or crash
    (nonzero exit status) from cvs-fast-export. Uses subset_test().

    Assumes that hangs can be detected because reducing the number of
    paths in a subset never increases the processing time.

    The timeout in seconds is taken from the CVSSTRIP_TIMEOUT environment
    variable and defaults to 60.
    """
    rootdirs = list(dirs) if isinstance(dirs, (list, tuple)) else [dirs]
    paths = sorted(listing(rootdirs))
    if not paths:
        print("No paths found.")
        return

    timeout = int(os.environ.get("CVSSTRIP_TIMEOUT", "60"))

    print("Testing %d paths with timeout=%d..." % (len(paths), timeout))
    (rc, _elapsed, timed_out) = subset_test(paths, timeout)
    if rc == 0 and not timed_out:
        print("No failure detected; nothing to winnow.")
        return

    def fails(subset):
        (rc, _elapsed, timed_out) = subset_test(subset, timeout)
        return rc != 0 or timed_out

    # Repeatedly halve the path list, keeping the first half that still
    # fails; stop when neither half fails on its own.
    while len(paths) >= 2:
        chunk = (len(paths) + 1) // 2
        reduced = False

        for i in range(0, len(paths), chunk):
            sub = paths[i : i + chunk]
            print("Testing subset %d..%d (%d paths)..." % (i, i + len(sub), len(sub)))
            if fails(sub):
                paths = sub
                reduced = True
                break
        if not reduced:
            break

    keep = set(paths)
    removed = 0
    # Prune every root that was listed, not just the first one.
    for rootdir in rootdirs:
        for root, _subdirs, files in os.walk(rootdir):
            for name in files:
                fpath = os.path.join(root, name)
                if fpath not in keep:
                    os.remove(fpath)
                    removed += 1
        # Remove empty directories (bottom-up), but keep the root.  The
        # directory is re-read with os.listdir() because the lists that
        # os.walk() yields were collected before the subdirectories
        # below were removed.
        for root, _subdirs, _files in os.walk(rootdir, topdown=False):
            if root != rootdir and not os.listdir(root):
                os.rmdir(root)

    print("Winnow complete: kept %d paths, removed %d." % (len(keep), removed))


def listing(dirs):
    "Return the paths of all files found beneath the given directories; entries that are not directories are skipped."
    return [
        os.path.join(parent, leaf)
        for top in dirs
        if os.path.isdir(top)
        for parent, _subdirs, leaves in os.walk(top)
        for leaf in leaves
    ]


class StripDebug(cmd.Cmd):
    "Interactive strip debugger"
    intro = "Welcome to the strip debugger. Ctrl-D to exit, ? for a command summary."
    prompt = "> "

    def __init__(self):
        cmd.Cmd.__init__(self)
        # Starting directory, restored when the session ends.
        self.pwd = os.getcwd()
        self.selection = "."

    def emptyline(self):
        # Overrides the cmd.Cmd handler for empty input lines so that
        # they do nothing.
        pass

    def help_cd(self):
        sys.stdout.write(
            """
Change directory to point to a tree of CVS masters.

Ad the end of the interpreter run, the original starting
directory will be restored.
"""
        )

    def do_cd(self, line):
        # A missing or mistyped directory must not kill the interactive
        # session with a traceback; report it and carry on.
        try:
            os.chdir(line)
        except OSError as err:
            sys.stderr.write("cvsstrip: cd failed: %s\n" % err)

    def help_EOF(self):
        sys.stdout.write(
            """
Exit the program cleanly, emitting a goodbye message.

Typing EOT (usually Ctrl-D) will exit quietly.
"""
        )

    def do_EOF(self, _line):
        os.chdir(self.pwd)
        print("\nGoodbye.")
        sys.exit(0)

    def do_shell(self, line):
        "Execute a shell command."
        sys.stdout.flush()
        sys.stderr.flush()
        if os.system(line):
            # %-formatting rather than an f-string: an f-string is a
            # syntax error under Python 2, which this file must support.
            print("'shell %s' returned error." % line)


if __name__ == "__main__":
    # Command-line driver: with no argument act as a filter, otherwise
    # mirror a directory of masters into a stripped copy (or, with -w,
    # winnow the directory in place).
    try:
        (opts, arguments) = getopt.getopt(sys.argv[1:], "cilo:tvw")
    except getopt.GetoptError as badopt:
        # Report a malformed command line instead of dumping a traceback.
        sys.stderr.write("cvsstrip: %s\n" % badopt)
        sys.exit(1)
    outdir = None
    winnow = False
    for opt, arg in opts:
        if opt == "-c":
            strip_content = False
        elif opt == "-i":
            StripDebug().cmdloop()
            raise SystemExit(0)
        elif opt == "-l":
            strip_logs = False
        elif opt == "-o":
            outdir = arg
        elif opt == "-t":
            strip_tags = False
        elif opt == "-v":
            verbose += 1
        elif opt == "-w":
            winnow = True

    if not arguments:
        # skeletonize() writes byte strings.  On Python 3 the standard
        # streams are text wrappers that reject bytes, so hand it the
        # underlying binary buffers; on Python 2 there is no .buffer and
        # the streams themselves are used.
        binary_in = getattr(sys.stdin, "buffer", sys.stdin)
        binary_out = getattr(sys.stdout, "buffer", sys.stdout)
        skeletonize(binary_in, binary_out)
        binary_out.flush()
        sys.exit(0)
    elif not os.path.isdir(arguments[0]):
        sys.stderr.write("cvsstrip: argument must be a directory.\n")
        sys.exit(1)

    originals = arguments[0]
    if not outdir:
        # normpath drops any trailing separator, which would otherwise
        # put the default output directory inside the input directory.
        outdir = os.path.normpath(originals) + "-reduced"
    if os.path.exists(outdir):
        sys.stderr.write("cvsstrip: refusing to step on %s.\n" % outdir)
        sys.exit(1)

    if winnow:
        reaper(originals)
        raise SystemExit(0)

    # Directory traversal
    for dirName, _subdirList, fileList in os.walk(originals):
        # Map each input directory to the same relative place under
        # outdir.  Using the path relative to the input root keeps this
        # correct when the input is given as an absolute or
        # multi-component path.
        relative = os.path.relpath(dirName, originals)
        newparts = [outdir]
        if relative != os.curdir:
            newparts.extend(relative.split(os.sep))
        newdir = os.path.join(*newparts)
        # os.walk is top-down, so parents have been created by the time
        # their children are visited; makedirs additionally covers missing
        # parents of outdir itself.
        if not os.path.exists(newdir):
            if verbose:
                print("Directory creation: %s" % newdir)
            os.makedirs(newdir)
        for fname in fileList:
            oldname = os.path.join(dirName, fname)
            newname = os.path.join(newdir, fname)
            if verbose > 0:
                print("%s -> %s" % (oldname, newname))
            if oldname.endswith(",v"):
                # Context managers close both files even if
                # skeletonize() bails out.
                with open(oldname, "rb") as old:
                    with open(newname, "wb") as new:
                        skeletonize(old, new)
            else:
                sys.stderr.write("cvsstrip: %s isn't a CVS master.\n" % oldname)
                shutil.copyfile(oldname, newname)

# end
