CHips L MINI SHELL

CHips L pro

Current Path : /usr/lib/python2.6/site-packages/pyzor-1.0.0-py2.6.egg/pyzor/
Upload File :
Current File : //usr/lib/python2.6/site-packages/pyzor-1.0.0-py2.6.egg/pyzor/digest.py

"""Handle digesting the messages."""

from __future__ import print_function

import re
import hashlib

try:
    import HTMLParser
except ImportError:
    import html.parser as HTMLParser

# Default digest specification, hard-coded for the moment.  Each entry is
# an (offset, length) pair: offset is a percentage into the list of
# digestable lines, length is how many consecutive lines to take from there.
digest_spec = [
    (20, 3),
    (60, 3),
]

# Hash constructor used for the digest, and the length of its hex form
# (two hex characters per digest byte).
HASH = hashlib.sha1
HASH_SIZE = HASH(b"").digest_size * 2


class HTMLStripper(HTMLParser.HTMLParser):
    """Collect the text of an HTML document, dropping all markup.

    Every non-empty, whitespace-stripped chunk of character data is
    appended to the ``collector`` list supplied by the caller.  Data seen
    between a <script> or <style> start tag and the matching end tag is
    not collected.
    """
    def __init__(self, collector):
        # Explicit base-class calls are used (rather than super()) because
        # HTMLParser is an old-style class on Python 2.
        HTMLParser.HTMLParser.__init__(self)
        self.reset()
        self.collector = collector
        # Cleared while inside <script>/<style>, set again at their end.
        self.collect = True

    def handle_data(self, data):
        """Store a chunk of text unless collection is switched off."""
        text = data.strip()
        if not text or not self.collect:
            return
        self.collector.append(text)

    def handle_starttag(self, tag, attrs):
        """Stop collecting when a <script> or <style> element opens."""
        HTMLParser.HTMLParser.handle_starttag(self, tag, attrs)
        name = tag.lower()
        if name == "script" or name == "style":
            self.collect = False

    def handle_endtag(self, tag):
        """Resume collecting when a <script> or <style> element closes."""
        HTMLParser.HTMLParser.handle_endtag(self, tag)
        name = tag.lower()
        if name == "script" or name == "style":
            self.collect = True


class DataDigester(object):
    """The major workhorse class.

    Given a message object, extracts the payload of each part, normalizes
    every line (removing long tokens, email addresses, URLs and all
    whitespace), drops lines that are too short, and feeds a selection of
    the remaining lines into a hash.  The resulting hex digest is stored
    in ``value``; the underlying hash object is kept in ``digest``.
    """
    __slots__ = ['value', 'digest']

    # Minimum line length for it to be included as part of the digest.
    min_line_length = 8

    # If a message is this many lines or less, then we digest the whole
    # message.
    atomic_num_lines = 4

    # We're not going to try to match email addresses as per the spec
    # because it's too difficult.  Plus, regular expressions don't work well
    # for them. (BNF is better at balanced parens and such).
    email_ptrn = re.compile(r'\S+@\S+')

    # Same goes for URLs.
    url_ptrn = re.compile(r'[a-z]+:\S+', re.IGNORECASE)

    # We also want to remove anything that is so long it looks like possibly
    # a unique identifier.
    longstr_ptrn = re.compile(r'\S{10,}')

    ws_ptrn = re.compile(r'\s')

    # String that the above patterns will be replaced with.
    # Note that an empty string will always be used to remove whitespace.
    unwanted_txt_repl = ''

    def __init__(self, msg, spec=None):
        """Digest ``msg`` and store the hex digest in ``self.value``.

        msg  -- message object; it must provide ``walk()`` and its parts
                the accessors used by ``digest_payloads`` (as
                ``email.message.Message`` does).
        spec -- list of (offset, length) pairs used when the message has
                more than ``atomic_num_lines`` digestable lines; defaults
                to the module-level ``digest_spec``.
        """
        if spec is None:
            spec = digest_spec
        self.value = None
        self.digest = HASH()

        # Need to know the total number of lines in the content, so all
        # digestable lines are gathered before any of them is hashed.
        lines = []
        for payload in self.digest_payloads(msg):
            for line in payload.splitlines():
                norm = self.normalize(line)
                if self.should_handle_line(norm):
                    try:
                        lines.append(norm.encode("utf8", "ignore"))
                    except UnicodeError:
                        # Best effort: a line that cannot be encoded is
                        # simply left out of the digest.
                        continue

        if len(lines) <= self.atomic_num_lines:
            self.handle_atomic(lines)
        else:
            self.handle_pieced(lines, spec)

        self.value = self.digest.hexdigest()

        assert len(self.value) == HASH_SIZE

    def handle_atomic(self, lines):
        """We digest everything."""
        for line in lines:
            self.handle_line(line)

    def handle_pieced(self, lines, spec):
        """Digest stuff according to the spec.

        For each (offset, length) pair, ``length`` consecutive lines are
        digested starting at ``offset`` percent of the way into ``lines``.
        Positions that fall outside the list are skipped.
        """
        total = len(lines)
        for offset, length in spec:
            start = int(offset * total // 100)
            # range() rather than xrange(): xrange does not exist on
            # Python 3, and the spans involved are only a few lines long.
            for i in range(length):
                try:
                    line = lines[start + i]
                except IndexError:
                    # The spec reaches past the end of the message.
                    pass
                else:
                    self.handle_line(line)

    def handle_line(self, line):
        """Feed a single (already encoded) line into the hash."""
        self.digest.update(line.rstrip())

    @classmethod
    def normalize(cls, s):
        """Return ``s`` with long tokens, emails, URLs and whitespace removed."""
        repl = cls.unwanted_txt_repl
        s = cls.longstr_ptrn.sub(repl, s)
        s = cls.email_ptrn.sub(repl, s)
        s = cls.url_ptrn.sub(repl, s)
        # Make sure we do the whitespace last because some of the previous
        # patterns rely on whitespace.
        return cls.ws_ptrn.sub('', s).strip()

    @staticmethod
    def normalize_html_part(s):
        """Return the text content of the HTML string ``s``.

        The text chunks gathered by ``HTMLStripper`` are joined with single
        spaces.  If parsing fails part-way, whatever was collected up to
        that point is returned.
        """
        data = []
        stripper = HTMLStripper(data)
        # HTMLParseError was removed from html.parser in Python 3.5;
        # referencing it unconditionally in the except clause would raise
        # AttributeError exactly when an error needs to be handled.
        parse_error = getattr(HTMLParser, "HTMLParseError", None)
        if parse_error is None:
            expected_errors = (UnicodeDecodeError,)
        else:
            expected_errors = (UnicodeDecodeError, parse_error)
        try:
            stripper.feed(s)
        except expected_errors:
            # We can't parse the HTML, so just strip it.  This is still
            # better than including generic HTML/CSS text.
            pass
        return " ".join(data)

    @classmethod
    def should_handle_line(cls, s):
        """Return True if the normalized line ``s`` is long enough to digest."""
        return len(s) > 0 and cls.min_line_length <= len(s)

    @classmethod
    def digest_payloads(cls, msg):
        """Yield the payload of every non-multipart part of ``msg``.

        Text parts are decoded using the part's declared charset (ASCII
        when none is given); HTML parts are additionally reduced to their
        text content.  Non-text parts are yielded undecoded.
        """
        for part in msg.walk():
            if part.get_content_maintype() == "text":
                payload = part.get_payload(decode=True)

                charset = part.get_content_charset()
                errors = "ignore"
                if not charset:
                    charset = "ascii"
                elif (charset.lower().replace("_", "-") in ("quopri-codec",
                      "quopri", "quoted-printable", "quotedprintable")):
                    # These names are transfer encodings rather than
                    # character sets; decode strictly so a failure falls
                    # through to the ASCII fallback below.
                    errors = "strict"

                try:
                    payload = payload.decode(charset, errors)
                except (LookupError, UnicodeError, AssertionError):
                    try:
                        payload = payload.decode("ascii", "ignore")
                    except UnicodeError:
                        continue
                if part.get_content_subtype() == "html":
                    yield cls.normalize_html_part(payload)
                else:
                    yield payload
            elif part.is_multipart():
                # Skip, because walk() will give us the payload next.
                pass
            else:
                # Non-text parts are passed through as-is.
                yield part.get_payload()


class PrintingDataDigester(DataDigester):
    """DataDigester variant that prints every line it digests."""
    def handle_line(self, line):
        """Print ``line`` decoded as UTF-8, then digest it as usual."""
        text = line.decode("utf8")
        print(text)
        super(PrintingDataDigester, self).handle_line(line)

Copyright 2K16 - 2K18 Indonesian Hacker Rulez