#!/usr/bin/env python
#
# filterator -- filter and format the output of comparator
#
import os, sys, re, getopt

# Pattern for the location lines in comparator's output.  The three
# captured groups are used below as file name, start line and end line.
shredline = re.compile(r"([^:]*):([0-9]+):([0-9]+)")

# Tokens to be removed when checking whether a shred is significant.
# The table is materialized as a list because nontrivial() walks it once
# per pass; a bare map() is a one-shot iterator on Python 3 and would be
# empty after the first pass.
junk = list(map(re.compile, (
    # Idioms that don't convey any meaning in isolation
    r"return *\(?[a-z]+\)? *;", r"return *\(?-?[01]+\)? *;", r"goto +[a-z]+;",
    r"exit *\([01]\);",
    # Pragmas
    r'/\* *ARGSUSED *\*/', r'/\* *NOTREACHED *\*/',
    # Bare C keywords
    r'\bbreak\b', r'\bcase\b', r'\bcontinue\b', r'\bdefault\b',
    r'\bdo\b', r'\belse\b', r'\bif\b', r'\bgoto\b', r'\breturn\b',
    r'\bswitch\b', r'\bwhile\b',
    # Type and storage-class keywords
    r'enum', r'\bint\b', r'\blong\b', r'\bshort\b', r'\bstatic\b',
    r'\bstruct\b', r'typedef', r'\bunion\b', r'\bvoid\b',
    # Preprocessor directives
    r'# *define', r'# *endif', r'# *else', r'# *if\b',
    r'# *ifdef\b', r'# *ifndef\b',
    # Comment delimiters and stray asterisks
    r'/\*+', r'\*+/', r'\*+',
    # Common preprocessor macros, not significant by themselves.
    r'\bASSERT\b', r'\bFALSE\b', r'\bNULL\b', r'\bSTATIC\b', r'\bTRUE\b',
    # Macro include lines are noise, too.
    r'\s*#include.*',
    # Common error macros.
    r'\bEINVAL\b', r'\bENOSYS\b',
    )))

def nontrivial(text):
    "Return what remains of a shred after C noise is stripped; empty means trivial."
    # Strategy: keep erasing every junk pattern and every punctuation
    # character until a complete pass leaves the text unchanged.  If
    # anything other than whitespace survives, the shred holds something
    # (typically an identifier) that might be interesting.
    punctuation = "{}()<>[];:,.%^&|*?+-/=!\n\t"
    text = ' ' + text
    previous = None
    while previous != text:
        previous = text
        for pattern in junk:
            text = pattern.sub(' ', text)
        for mark in punctuation:
            text = text.replace(mark, ' ')
    return text.strip()

def _read_shred(path, start, end):
    """Return lines start..end (1-based, inclusive) of the file at path.

    A line that begins with '%' gets one extra '%' prepended (presumably
    so that source text cannot be confused with the '%'-prefixed report
    lines written around it).  If the file ends before line `end`, the
    lines that could be read are returned.  Raises IOError when the file
    cannot be opened or read; the file is always closed.
    """
    rfp = open(path)
    try:
        # Skip everything ahead of the first line of the shred.
        for _ in range(start - 1):
            rfp.readline()
        pieces = []
        for _ in range(start, end + 1):
            nextline = rfp.readline()
            if not nextline:
                # File is shorter than the recorded range; stop at EOF.
                break
            if nextline.startswith('%'):
                nextline = '%' + nextline
            pieces.append(nextline)
    finally:
        rfp.close()
    return "".join(pieces)

if __name__ == '__main__':
    try:
        (optlist, args) = getopt.getopt(sys.argv[1:], 'd:n')
    except getopt.GetoptError:
        sys.stderr.write("usage: filterator [-d dir] [-n]\n")
        sys.exit(2)
    nofilter = False
    for (opt, val) in optlist:
        if opt == '-d':
            # Source file names in the report are resolved relative to here.
            os.chdir(val)
        elif opt == '-n':
            nofilter = True

    # Read the SCF header
    hash_method = "MD5"
    merge_program = None
    normalization = None
    shredsize = None
    ident = sys.stdin.readline()
    if not ident.startswith("#SCF-B "):
        sys.stderr.write("filterator: input is not a SCF-B file.\n")
        sys.exit(1)
    while True:
        line = sys.stdin.readline()
        if not line or line == '%%\n':
            break
        # Split on the first colon only, so values may contain colons.
        fields = line.split(":", 1)
        if len(fields) != 2:
            sys.stderr.write("filterator: malformed header line: %s\n"
                             % line.strip())
            sys.exit(1)
        tag = fields[0]
        value = fields[1].strip()
        if tag == "Normalization":
            normalization = value
        elif tag == "Shred-Size":
            shredsize = int(value)
        elif tag == "Merge-Program":
            merge_program = value
        elif tag == "Hash-Method":
            hash_method = value
    if normalization is None or shredsize is None:
        sys.stderr.write(
            "filterator: header lacks Normalization or Shred-Size.\n")
        sys.exit(1)

    # Write the header of the filtered report.
    emit = sys.stdout.write
    if nofilter:
        filtering = "none"
    else:
        filtering = "C-syntax"
    emit("Filter-Program: filterator 1.0\n")
    emit("Filtering: %s\n" % filtering)
    emit("Hash-Method: %s\n" % hash_method)
    if merge_program:
        emit("Merge-Program: %s\n" % merge_program)
    emit("Normalization: %s\n" % normalization)
    emit("Shred-Size: %d\n" % shredsize)
    emit("%%\n")

    # Each group of location lines is terminated by a '%%' line.  For
    # every group, the text is fetched from the first location only and
    # reported together with the size of the group.
    count = 0
    locations = []
    while True:
        line = sys.stdin.readline()
        if not line:
            break
        m = shredline.search(line)
        if m:
            locations.append((m.group(1), int(m.group(2)), int(m.group(3))))
        if line == '%%\n' and locations:
            (path, start, end) = locations[0]
            try:
                text = _read_shred(path, start, end)
            except IOError:
                sys.stderr.write("filterator: can't open %s\n" % path)
                sys.exit(1)
            if nofilter or nontrivial(text):
                count += 1
                emit("%% %s:%s-%s: (%d matches)\n"
                     % (path, start, end, len(locations)))
                emit(text)
            locations = []
    emit("%d overlaps.\n" % count)
