Source code for pyrocore.util.metafile

# -*- coding: utf-8 -*-
# pylint: disable=
""" Metafile Support.

    Copyright (c) 2009, 2010, 2011 The PyroScope Project <pyroscope.project@gmail.com>
"""
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
from __future__ import with_statement
from __future__ import absolute_import
from __future__ import unicode_literals

import re
import sys
import time
import stat
import math
import errno
import pprint
import fnmatch
import hashlib
import urlparse

from pyrobase import bencode
from pyrobase.parts import Bunch
from pyrocore import config, error
from pyrocore.util import os, fmt, pymagic


# Allowed characters in a metafile filename or path
ALLOWED_ROOT_NAME = re.compile(r"^[^/\\.~][^/\\]*$") # cannot be absolute or ~user, and cannot have path parts
ALLOWED_PATH_NAME = re.compile(r"^(?:~\d+)?[^/\\~][^/\\]*$")

# Character sequences considered secret (roughly, any path part or query parameter
# that looks like an alphanumeric sequence or url-safe base64 string)
PASSKEY_RE = re.compile(r"(?<=[/=])[-_0-9a-zA-Z]{5,64}={0,3}(?=[/&]|$)")

# Non-secret exemptions
PASSKEY_OK = ("announce", "TrackerServlet",)

# List of all standard keys in a metafile
METAFILE_STD_KEYS = [_i.split('.') for _i in (
    "announce",
    "announce-list", # BEP-0012
    "comment",
    "created by",
    "creation date",
    "encoding",
    "info",
    "info.length",
    "info.name",
    "info.piece length",
    "info.pieces",
    "info.private",
    "info.files",
    "info.files.length",
    "info.files.path",
)]

del _i  # pylint: disable=undefined-loop-variable


[docs]def console_progress():
    """ Return a progress indicator for consoles if
        stdout is a tty.
    """
    def progress(totalhashed, totalsize):
        "Helper"
        msg = " " * 30
        if totalhashed < totalsize:
            msg = "%5.1f%% complete" % (totalhashed * 100.0 / totalsize)
        sys.stdout.write(msg + " \r")
        sys.stdout.flush()

    try:
        return progress if sys.stdout.isatty() else None
    except AttributeError:
        return None


[docs]def mask_keys(announce_url):
    """ Mask any passkeys (hex sequences) in an announce URL.
    """
    return PASSKEY_RE.sub(
        lambda m: m.group() if m.group() in PASSKEY_OK else "*" * len(m.group()),
        announce_url)


[docs]class MaskingPrettyPrinter(pprint.PrettyPrinter):
    """ A PrettyPrinter that masks strings in the object tree.
    """

[docs]    def format(self, obj, context, maxlevels, level):  # pylint: disable=arguments-differ
        """ Mask obj if it looks like an URL, then pass it to the super class.
        """
        if isinstance(obj, basestring) and "://" in fmt.to_unicode(obj):
            obj = mask_keys(obj)
        return pprint.PrettyPrinter.format(self, obj, context, maxlevels, level)


[docs]def check_info(info):
    """ Validate info dict.

        Raise ValueError if validation fails.
    """
    if not isinstance(info, dict):
        raise ValueError("bad metainfo - not a dictionary")

    pieces = info.get("pieces")
    if not isinstance(pieces, basestring) or len(pieces) % 20 != 0:
        raise ValueError("bad metainfo - bad pieces key")

    piece_size = info.get("piece length")
    if not isinstance(piece_size, (int, long)) or piece_size <= 0:
        raise ValueError("bad metainfo - illegal piece length")

    name = info.get("name")
    if not isinstance(name, basestring):
        raise ValueError("bad metainfo - bad name (type is %r)" % type(name).__name__)
    if not ALLOWED_ROOT_NAME.match(name):
        raise ValueError("name %s disallowed for security reasons" % name)

    if ("files" in info) == ("length" in info):
        raise ValueError("single/multiple file mix")

    if "length" in info:
        length = info.get("length")
        if not isinstance(length, (int, long)) or length < 0:
            raise ValueError("bad metainfo - bad length")
    else:
        files = info.get("files")
        if not isinstance(files, (list, tuple)):
            raise ValueError("bad metainfo - bad file list")

        for item in files:
            if not isinstance(item, dict):
                raise ValueError("bad metainfo - bad file value")

            length = item.get("length")
            if not isinstance(length, (int, long)) or length < 0:
                raise ValueError("bad metainfo - bad length")

            path = item.get("path")
            if not isinstance(path, (list, tuple)) or not path:
                raise ValueError("bad metainfo - bad path")

            for part in path:
                if not isinstance(part, basestring):
                    raise ValueError("bad metainfo - bad path dir")
                part = fmt.to_unicode(part)
                if part == '..':
                    raise ValueError("relative path in %s disallowed for security reasons" % '/'.join(path))
                if part and not ALLOWED_PATH_NAME.match(part):
                    raise ValueError("path %s disallowed for security reasons" % part)

        file_paths = [os.sep.join(item["path"]) for item in files]
        if len(set(file_paths)) != len(file_paths):
            raise ValueError("bad metainfo - duplicate path")

    return info


[docs]def check_meta(meta):
    """ Validate meta dict.

        Raise ValueError if validation fails.
    """
    if not isinstance(meta, dict):
        raise ValueError("bad metadata - not a dictionary")
    if not isinstance(meta.get("announce"), basestring):
        raise ValueError("bad announce URL - not a string")
    check_info(meta.get("info"))

    return meta


[docs]def clean_meta(meta, including_info=False, logger=None):
    """ Clean meta dict. Optionally log changes using the given logger.

        @param logger: If given, a callable accepting a string message.
        @return: Set of keys removed from C{meta}.
    """
    modified = set()

    for key in meta.keys():
        if [key] not in METAFILE_STD_KEYS:
            if logger:
                logger("Removing key %r..." % (key,))
            del meta[key]
            modified.add(key)

    if including_info:
        for key in meta["info"].keys():
            if ["info", key] not in METAFILE_STD_KEYS:
                if logger:
                    logger("Removing key %r..." % ("info." + key,))
                del meta["info"][key]
                modified.add("info." + key)

        for idx, entry in enumerate(meta["info"].get("files", [])):
            for key in entry.keys():
                if ["info", "files", key] not in METAFILE_STD_KEYS:
                    if logger:
                        logger("Removing key %r from file #%d..." % (key, idx + 1))
                    del entry[key]
                    modified.add("info.files." + key)

            # Remove crap that certain PHP software puts in paths
            entry["path"] = [i for i in entry["path"] if i]

    return modified


[docs]def sanitize(meta):
    """ Try to fix common problems, especially transcode non-standard string encodings.
    """
    def sane_encoding(text):
        "Transcoding helper."
        for encoding in ('utf-8', meta.get('encoding', None), 'cp1252'):
            if encoding:
                try:
                    return text.decode(encoding).encode("utf-8")
                except UnicodeError:
                    continue
        else:
            # Broken beyond anything reasonable
            return str(text, 'utf-8', 'replace').replace('\ufffd', '_').encode("utf-8")

    # Go through all string fields and check them
    for field in ("comment", "created by"):
        if field in meta:
            meta[field] = sane_encoding(meta[field])

    meta["info"]["name"] = sane_encoding(meta["info"]["name"])

    for entry in meta["info"].get("files", []):
        entry["path"] = [sane_encoding(i) for i in entry["path"]]

    return meta


[docs]def assign_fields(meta, assignments):
    """ Takes a list of C{key=value} strings and assigns them to the
        given metafile. If you want to set nested keys (e.g. "info.source"),
        you have to use a dot as a separator. For exotic keys *containing*
        a dot, double that dot ("dotted..key").

        Numeric values starting with "+" or "-" are converted to integers.

        If just a key name is given (no '='), the field is removed.
    """
    for assignment in assignments:
        try:
            if '=' in assignment:
                field, val = assignment.split('=', 1)
            else:
                field, val = assignment, None

            if val and val[0] in "+-" and val[1:].isdigit():
                val = int(val, 10)

            # TODO: Allow numerical indices, and "+" for append
            namespace = meta
            keypath = [i.replace('\0', '.') for i in field.replace('..', '\0').split('.')]
            for key in keypath[:-1]:
                # Create missing dicts as we go...
                namespace = namespace.setdefault(key, {})
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            raise error.UserError("Bad assignment %r (%s)!" % (assignment, exc))
        else:
            if val is None:
                del namespace[keypath[-1]]
            else:
                namespace[keypath[-1]] = val

    return meta


[docs]def add_fast_resume(meta, datapath):
    """ Add fast resume data to a metafile dict.
    """
    # Get list of files
    files = meta["info"].get("files", None)
    single = files is None
    if single:
        if os.path.isdir(datapath):
            datapath = os.path.join(datapath, meta["info"]["name"])
        files = [Bunch(
            path=[os.path.abspath(datapath)],
            length=meta["info"]["length"],
        )]

    # Prepare resume data
    resume = meta.setdefault("libtorrent_resume", {})
    resume["bitfield"] = len(meta["info"]["pieces"]) // 20
    resume["files"] = []
    piece_length = meta["info"]["piece length"]
    offset = 0

    for fileinfo in files:
        # Get the path into the filesystem
        filepath = os.sep.join(fileinfo["path"])
        if not single:
            filepath = os.path.join(datapath, filepath.strip(os.sep))

        # Check file size
        if os.path.getsize(filepath) != fileinfo["length"]:
            raise OSError(errno.EINVAL, "File size mismatch for %r [is %d, expected %d]" % (
                filepath, os.path.getsize(filepath), fileinfo["length"],
            ))

        # Add resume data for this file
        resume["files"].append(dict(
            priority=1,
            mtime=int(os.path.getmtime(filepath)),
            completed=(offset+fileinfo["length"]+piece_length-1) // piece_length
                     - offset // piece_length,
        ))
        offset += fileinfo["length"]

    return meta


[docs]def info_hash(metadata):
    """ Return info hash as a string.
    """
    return hashlib.sha1(bencode.bencode(metadata['info'])).hexdigest().upper()


[docs]def data_size(metadata):
    """ Calculate the size of a torrent based on parsed metadata.
    """
    info = metadata['info']

    if 'length' in info:
        # Single file
        total_size = info['length']
    else:
        # Directory structure
        total_size = sum([f['length'] for f in info['files']])

    return total_size


[docs]def checked_open(filename, log=None, quiet=False):
    """ Open and validate the given metafile.
        Optionally provide diagnostics on the passed logger, for
        invalid metafiles, which then just cause a warning but no exception.
        "quiet" can supress that warning.
    """
    with open(filename, "rb") as handle:
        raw_data = handle.read()
    data = bencode.bdecode(raw_data)

    try:
        check_meta(data)
        if raw_data != bencode.bencode(data):
            raise ValueError("Bad bencoded data - dict keys out of order?")
    except ValueError as exc:
        if log:
            # Warn about it, unless it's a quiet value query
            if not quiet:
                log.warn("%s: %s" % (filename, exc))
        else:
            raise

    return data


[docs]class Metafile(object):
    """ A torrent metafile.
    """

    # Patterns of names to ignore
    IGNORE_GLOB = [
        "core", "CVS", ".*", "*~", "*.swp", "*.tmp", "*.bak",
        "[Tt]humbs.db", "[Dd]esktop.ini", "ehthumbs_vista.db",
    ]


    def __init__(self, filename, datapath=None):
        """ Initialize metafile.
        """
        self.filename = filename
        self.progress = None
        self.datapath = datapath
        self.ignore = self.IGNORE_GLOB[:]
        self.LOG = pymagic.get_class_logger(self)


    def _get_datapath(self):
        """ Get a valid datapath, else raise an exception.
        """
        if self._datapath is None:
            raise OSError(errno.ENOENT, "You didn't provide any datapath for %r" % self.filename)

        return self._datapath

    def _set_datapath(self, datapath):
        """ Set a datapath.
        """
        if datapath:
            self._datapath = datapath.rstrip(os.sep)
            self._fifo = int(stat.S_ISFIFO(os.stat(self.datapath).st_mode))
        else:
            self._datapath = None
            self._fifo = False

    datapath = property(_get_datapath, _set_datapath)


[docs]    def walk(self):
        """ Generate paths in "self.datapath".
        """
        # FIFO?
        if self._fifo:
            if self._fifo > 1:
                raise RuntimeError("INTERNAL ERROR: FIFO read twice!")
            self._fifo += 1

            # Read paths relative to directory containing the FIFO
            with open(self.datapath, "r") as fifo:
                while True:
                    relpath = fifo.readline().rstrip('\n')
                    if not relpath: # EOF?
                        break
                    self.LOG.debug("Read relative path %r from FIFO..." % (relpath,))
                    yield os.path.join(os.path.dirname(self.datapath), relpath)

            self.LOG.debug("FIFO %r closed!" % (self.datapath,))

        # Directory?
        elif os.path.isdir(self.datapath):
            # Walk the directory tree
            for dirpath, dirnames, filenames in os.walk(self.datapath): #, followlinks=True):
                # Don't scan blacklisted directories
                for bad in dirnames[:]:
                    if any(fnmatch.fnmatch(bad, pattern) for pattern in self.ignore):
                        dirnames.remove(bad)

                # Yield all filenames that aren't blacklisted
                for filename in filenames:
                    if not any(fnmatch.fnmatch(filename, pattern) for pattern in self.ignore):
                        #yield os.path.join(dirpath[len(self.datapath)+1:], filename)
                        yield os.path.join(dirpath, filename)

        # Single file
        else:
            # Yield the filename
            yield self.datapath


    def _calc_size(self):
        """ Get total size of "self.datapath".
        """
        return sum(os.path.getsize(filename)
            for filename in self.walk()
        )


    def _make_info(self, piece_size, progress, walker, piece_callback=None):
        """ Create info dict.
        """
        # These collect the file descriptions and piece hashes
        file_list = []
        pieces = []

        # Initialize progress state
        hashing_secs = time.time()
        totalsize = -1 if self._fifo else self._calc_size()
        totalhashed = 0

        # Start a new piece
        sha1sum = hashlib.sha1()
        done = 0
        filename = None

        # Hash all files
        for filename in walker:
            # Assemble file info
            filesize = os.path.getsize(filename)
            filepath = filename[len(os.path.dirname(self.datapath) if self._fifo else self.datapath):].lstrip(os.sep)
            file_list.append({
                "length": filesize,
                "path": [fmt.to_utf8(x) for x in fmt.to_unicode(filepath).replace(os.sep, '/').split('/')],
            })
            self.LOG.debug("Hashing %r, size %d..." % (filename, filesize))

            # Open file and hash it
            fileoffset = 0
            handle = open(filename, "rb")
            try:
                while fileoffset < filesize:
                    # Read rest of piece or file, whatever is smaller
                    chunk = handle.read(min(filesize - fileoffset, piece_size - done))
                    sha1sum.update(chunk) # bogus pylint: disable=E1101
                    done += len(chunk)
                    fileoffset += len(chunk)
                    totalhashed += len(chunk)

                    # Piece is done
                    if done == piece_size:
                        pieces.append(sha1sum.digest()) # bogus pylint: disable=E1101
                        if piece_callback:
                            piece_callback(filename, pieces[-1])

                        # Start a new piece
                        sha1sum = hashlib.sha1()
                        done = 0

                    # Report progress
                    if progress:
                        progress(totalhashed, totalsize)
            finally:
                handle.close()

        # Add hash of partial last piece
        if done > 0:
            pieces.append(sha1sum.digest()) # bogus pylint: disable=E1103
            if piece_callback:
                piece_callback(filename, pieces[-1])

        # Build the meta dict
        metainfo = {
            "pieces": b"".join(pieces),
            "piece length": piece_size,
            "name": os.path.basename(self.datapath),
        }

        # Handle directory/FIFO vs. single file
        if self._fifo or os.path.isdir(self.datapath):
            metainfo["files"] = file_list
        else:
            metainfo["length"] = totalhashed

        hashing_secs = time.time() - hashing_secs
        self.LOG.info("Hashing of %s took %.1f secs (%s/s)" % (
            fmt.human_size(totalhashed).strip(), hashing_secs, fmt.human_size(totalhashed / hashing_secs).strip(),
        ))

        # Return validated info dict
        return check_info(metainfo), totalhashed


    def _make_meta(self, tracker_url, root_name, private, progress):
        """ Create torrent dict.
        """
        # Calculate piece size
        if self._fifo:
            # TODO we need to add a (command line) param, probably for total data size
            # for now, always 1MB
            piece_size_exp = 20
        else:
            total_size = self._calc_size()
            if total_size:
                piece_size_exp = int(math.log(total_size) / math.log(2)) - 9
            else:
                piece_size_exp = 0

        piece_size_exp = min(max(15, piece_size_exp), 24)
        piece_size = 2 ** piece_size_exp

        # Build info hash
        info, totalhashed = self._make_info(piece_size, progress, self.walk() if self._fifo else sorted(self.walk()))

        # Enforce unique hash per tracker
        info["x_cross_seed"] = hashlib.md5(tracker_url).hexdigest()

        # Set private flag
        if private:
            info["private"] = 1

        # Freely chosen root name (default is basename of the data path)
        if root_name:
            info["name"] = root_name

        # Torrent metadata
        meta = {
            "info": info,
            "announce": tracker_url.strip(),
        }

        #XXX meta["encoding"] = "UTF-8"

        # Return validated meta dict
        return check_meta(meta), totalhashed


[docs]    def create(self, datapath, tracker_urls, comment=None, root_name=None,
                     created_by=None, private=False, no_date=False, progress=None,
                     callback=None):
        """ Create a metafile with the path given on object creation.
            Returns the last metafile dict that was written (as an object, not bencoded).
        """
        if datapath:
            self.datapath = datapath

        try:
            tracker_urls = ['' + tracker_urls]
        except TypeError:
            tracker_urls = list(tracker_urls)
        multi_mode = len(tracker_urls) > 1

        # TODO add optimization so the hashing happens only once for multiple URLs!
        for tracker_url in tracker_urls:
            # Lookup announce URLs from config file
            try:
                if urlparse.urlparse(tracker_url).scheme:
                    tracker_alias = urlparse.urlparse(tracker_url).netloc.split(':')[0].split('.')
                    tracker_alias = tracker_alias[-2 if len(tracker_alias) > 1 else 0]
                else:
                    tracker_alias, tracker_url = config.lookup_announce_alias(tracker_url)
                    tracker_url = tracker_url[0]
            except (KeyError, IndexError):
                raise error.UserError("Bad tracker URL %r, or unknown alias!" % (tracker_url,))

            # Determine metafile name
            output_name = self.filename
            if multi_mode:
                # Add 2nd level of announce URL domain to metafile name
                output_name = list(os.path.splitext(output_name))
                try:
                    output_name[1:1] = '-' + tracker_alias
                except (IndexError,):
                    self.LOG.error("Malformed announce URL %r, skipping!" % (tracker_url,))
                    continue
                output_name = ''.join(output_name)

            # Hash the data
            self.LOG.info("Creating %r for %s %r..." % (
                output_name, "filenames read from" if self._fifo else "data in", self.datapath,
            ))
            meta, _ = self._make_meta(tracker_url, root_name, private, progress)

            # Add optional fields
            if comment:
                meta["comment"] = comment
            if created_by:
                meta["created by"] = created_by
            if not no_date:
                meta["creation date"] = int(time.time())
            if callback:
                callback(meta)

            # Write metafile to disk
            self.LOG.debug("Writing %r..." % (output_name,))
            bencode.bwrite(output_name, meta)

        return meta


[docs]    def check(self, metainfo, datapath, progress=None):
        """ Check piece hashes of a metafile against the given datapath.
        """
        if datapath:
            self.datapath = datapath

        def check_piece(filename, piece):
            "Callback for new piece"
            if piece != metainfo["info"]["pieces"][check_piece.piece_index:check_piece.piece_index+20]:
                self.LOG.warn("Piece #%d: Hashes differ in file %r" % (check_piece.piece_index//20, filename))
            check_piece.piece_index += 20
        check_piece.piece_index = 0

        datameta, _ = self._make_info(int(metainfo["info"]["piece length"]), progress,
            [datapath] if "length" in metainfo["info"] else
            (os.path.join(*([datapath] + i["path"])) for i in metainfo["info"]["files"]),
            piece_callback=check_piece
        )
        return datameta["pieces"] == metainfo["info"]["pieces"]


[docs]    def listing(self, masked=True):
        """ List torrent info & contents. Returns a list of formatted lines.
        """
        # Assemble data
        metainfo = sanitize(bencode.bread(self.filename))
        announce = metainfo['announce']
        info = metainfo['info']
        infohash = hashlib.sha1(bencode.bencode(info))

        total_size = data_size(metainfo)
        piece_length = info['piece length']
        piece_number, last_piece_length = divmod(total_size, piece_length)

        # Build result
        result = [
            "NAME %s" % (os.path.basename(fmt.to_unicode(self.filename))),
            "SIZE %s (%i * %s + %s)" % (
                fmt.human_size(total_size).strip(),
                piece_number, fmt.human_size(piece_length).strip(),
                fmt.human_size(last_piece_length).strip(),
            ),
            "META %s (pieces %s %.1f%%)" % (
                fmt.human_size(os.path.getsize(self.filename)).strip(),
                fmt.human_size(len(info["pieces"])).strip(),
                100.0 * len(info["pieces"]) / os.path.getsize(self.filename),
            ),
            "HASH %s" % (infohash.hexdigest().upper()),
            "URL  %s" % (mask_keys if masked else str)(announce),
            "PRV  %s" % ("YES (DHT/PEX disabled)" if info.get("private") else "NO (DHT/PEX enabled)"),
            "TIME %s" % ("N/A" if "creation date" not in metainfo else
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(metainfo["creation date"]))
            ),
        ]

        for label, key in (("BY  ", "created by"), ("REM ", "comment")):
            if key in metainfo:
                result.append("%s %s" % (label, metainfo.get(key, "N/A")))

        result.extend([
            "",
            "FILE LISTING%s" % ("" if 'length' in info else " [%d file(s)]" % len(info['files']),),
        ])
        if 'length' in info:
            # Single file
            result.append("%-69s%9s" % (
                    fmt.to_unicode(info['name']),
                    fmt.human_size(total_size),
            ))
        else:
            # Directory structure
            result.append("%s/" % fmt.to_unicode(info['name']))
            oldpaths = [None] * 99
            for entry in info['files']:
                # Remove crap that certain PHP software puts in paths
                entry_path = [fmt.to_unicode(i) for i in entry["path"] if i]

                for idx, item in enumerate(entry_path[:-1]):
                    if item != oldpaths[idx]:
                        result.append("%s%s/" % (' ' * (4*(idx+1)), item))
                        oldpaths[idx] = item
                result.append("%-69s%9s" % (
                    ' ' * (4*len(entry_path)) + entry_path[-1],
                    fmt.human_size(entry['length']),
                ))

        return result