#!/usr/bin/python

#    Copyright (C) 2000  Bastian Kleineidam
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

# imports and checks
import sys
# NOTE(review): the first five characters of sys.version are compared as a
# string, i.e. lexicographically and not as a numeric version.
if sys.version[:5] < "1.5.2":
    raise SystemExit, "This program requires Python 1.5.2 or later."
import getopt,re,string,os,urlparse
# 90 seconds timeout for all connections
# (currently disabled: both lines below are commented out)
#import timeoutsocket
#timeoutsocket.setDefaultSocketTimeout(90)
import linkcheck, StringUtil
from linkcheck import _


# Option reference shown by printHelp().  The text is passed through _()
# (imported from linkcheck; presumably the message translation function --
# confirm) and its single %s placeholder is filled with
# linkcheck.Config.LoggerKeys.
# NOTE(review): the -p entry below states the default password is 'joe@',
# while this script initializes _password to "guest@".  One of the two is
# probably stale; verify against the configuration defaults before changing
# the text (changing it also changes the message id passed to _()).
Usage = _("USAGE\tlinkchecker [options] file-or-url...\n"
"\n"
"OPTIONS\n"
"For single-letter option arguments the space is not a necessity. So\n"
"'-o colored' is the same as '-ocolored'.\n"
"-a, --anchors\n"
"        Check anchor references. Default is don't check anchors.\n"
"-d, --denyallow\n"
"        Swap checking order to extern/intern. Default checking order\n"
"        is intern/extern.\n"
"-D, --debug\n"
"        Print additional debugging information.\n"
"-e regex, --extern=regex\n"
"        Assume urls that match the given expression as extern.\n"
"        Only intern HTML links are checked recursively.\n"
"-f file, --config=file\n"
"        Use file as configuration file. LinkChecker first searches\n"
"        ~/.linkcheckerrc and then /etc/linkcheckerrc\n"
"        (under Windows <path-to-program>\\linkcheckerrc).\n"
"-F type, --file-output=type\n"
"        Same as output, but write to a file linkchecker-out.<type>.\n"
"        If the file already exists, it is overwritten. You can specify\n"
"        this option more than once. There is no file output for the\n"
"        blacklist logger. Default is no file output.\n"
"-i regex, --intern=regex\n"
"        Assume URLs that match the given expression as intern.\n"
"        LinkChecker descends recursively only to intern URLs, not to extern.\n"
"-h, --help\n"
"        Help me! Print usage information for this program.\n"
"-N server, --nntp-server=server\n"
"        Specify an NNTP server for 'news:...' links. Default is the\n"
"        environment variable NNTP_SERVER. If no host is given,\n"
"        only the syntax of the link is checked.\n"
"-o type, --output=type\n"
"        Specify output type as %s.\n"
"        Default type is text.\n"
"-p pwd, --password=pwd\n"
"        Try password pwd for HTML and FTP authorization.\n"
"        Default password is 'joe@'. See also -u.\n"
"-q, --quiet\n"
"        Quiet operation. This is only useful with -F.\n"
"-r depth, --recursion-level=depth\n"
"        Check recursively all links up to given depth (depth >= 0).\n"
"        Default depth is 1.\n"
"-R, --robots-txt\n"
"        Obey the robots exclusion standard.\n"
"-s, --strict\n"
"        Check only syntax of extern links, do not try to connect to them.\n"
"-t num, --threads=num\n"
"        Generate no more than num threads. Default number of threads is 5.\n"
"        To disable threading specify a non-positive number.\n"
"-u name, --user=name\n"
"        Try username name for HTML and FTP authorization.\n"
"        Default is 'anonymous'. See also -p.\n"
"-V, --version\n"
"        Print version and exit.\n"
"-v, --verbose\n"
"        Log all checked URLs (implies -w). Default is to log only invalid\n"
"        URLs.\n"
"-w, --warnings\n"
"        Log warnings.\n"
"-W regex, --warning-regex=regex\n"
"        Define a regular expression which prints a warning if it matches\n"
"        any content of the checked link.\n"
"        This applies of course only to pages which are valid, so we can\n"
"        get their content.\n"
"        Use this to check for pages that contain some form of error\n"
"        message, for example 'This page has moved' or 'Oracle\n"
"        Application Server error'.\n"
"        This option implies -w.\n") % linkcheck.Config.LoggerKeys

# General remarks shown by printHelp() after the option reference.
# The text is passed through _() (imported from linkcheck).
Notes = _("NOTES\n"
"o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL\n"
"  starts with 'www.' resp. 'ftp.'\n"
"  You can also give local files as arguments.\n"
"o If you have your system configured to automatically establish a\n"
"  connection to the internet (e.g. with diald), it will connect when\n"
"  checking links not pointing to your local host.\n"
"  Use the -s and -i options to prevent this.\n"
"o Javascript links are currently ignored.\n"
"o If your platform does not support threading, LinkChecker uses -t0.\n"
"o You can supply multiple user/password pairs in a configuration file.\n"
"o Cookies are not accepted by LinkChecker.\n"
"o To use proxies set $http_proxy, $https_proxy on Unix or Windows.\n"
"  On a Mac use the Internet Config.\n"
"o When checking 'news:' links the given NNTP host doesn't need to be the\n"
"  same as the host of the user browsing your pages!\n")

# Sample invocations shown by printHelp() as the last help section.
# The text is passed through _() (imported from linkcheck).
Examples = _("EXAMPLES\n"
"o linkchecker -v -ohtml -r2 -s -itreasure.calvinsplayground.de \\\n"
"    http://treasure.calvinsplayground.de/~calvin/ > sample.html\n"
"o Local files and syntactic sugar on the command line:\n"
"      linkchecker c:\\temp\\test.html\n"
"      linkchecker ../bla.html\n"
"      linkchecker www.myhomepage.de\n"
"      linkchecker -r0 ftp.linux.org\n")

def printVersion():
    print linkcheck.Config.AppInfo
    sys.exit(0)
  
def printHelp():
    if os.name!='posix':
        StringUtil.paginate(Usage+"\n"+Notes+"\n"+Examples)
    else:
        print Usage
	print Notes
	print Examples
    sys.exit(0)
    
def printUsage(msg):
    """Write an error line for msg plus a help hint to stderr, exit with 1."""
    err = sys.stderr
    err.write(_("Error: %s\n") % msg)
    err.write(_("Execute 'linkchecker -h' for help\n"))
    sys.exit(1)


# Read command line arguments
try:
    # Note: cut out the name of the script
    options, args = getopt.getopt(sys.argv[1:],
    "adDe:f:F:hi:N:o:p:qr:Rst:u:VvwW:", # short options
    ["anchors",                       # long options
    "config=",
    "debug", 
    "extern=",
    "file-output=",
    "nntp-server=",
    "help",
    "intern=",
    "denyallow",
    "output=",
    "password=",
    "quiet",
    "recursion-level=",
    "wischiwaschi",
    "robots-txt",
    "strict",
    "threads=",
    "user=",
    "version",
    "verbose",
    "warnings",
    "warning-regex="])
except getopt.error:
    type, value = sys.exc_info()[:2]
    printUsage(value)

# apply configuration
# First pass over the options: only the config file and debug options are
# handled here, so that config.read() runs before the other options are
# applied further below.
config = linkcheck.Config.Configuration()
configfiles = []
for opt, arg in options:
    if opt in ("-f", "--config"):
        configfiles.append(arg)
    elif opt in ("-D", "--debug"):
        # debugging also switches threading off
        linkcheck.Config.DebugFlag = 1
        config.disableThreading()
config.read(configfiles)

# apply options and arguments
# Second pass over the options; -f/--config and -D/--debug were already
# handled when the configuration was read, so they are not matched here.

# Values used for the catch-all authentication entry when only one of
# -u / -p is given on the command line.
_user = "anonymous"
_password = "guest@"
# set to 1 as soon as -u or -p is seen
constructauth = 0


def _illegalArgument(arg, optnames):
    """Report an invalid option argument through printUsage (which exits).

    arg is the offending argument text; optnames is the quoted option
    name pair, e.g. "'-o, --output'".
    """
    printUsage((_("Illegal argument '%s' for option ") % arg) + optnames)


def _compileRegex(arg, optnames):
    """Return re.compile(arg); an invalid pattern is a usage error."""
    try:
        return re.compile(arg)
    except re.error:
        _illegalArgument(arg, optnames)


def _parseInt(arg, optnames):
    """Return int(arg); a non-numeric argument is a usage error."""
    try:
        return int(arg)
    except ValueError:
        _illegalArgument(arg, optnames)


for opt,arg in options:
    if opt=="-a" or opt=="--anchors":
        config["anchors"] = 1

    elif opt=="-e" or opt=="--extern":
        config["externlinks"].append(
            (_compileRegex(arg, "'-e, --extern'"), 0))

    elif opt=="-h" or opt=="--help":
        printHelp()

    elif opt=="-o" or opt=="--output":
        if linkcheck.Config.Loggers.has_key(arg):
            config['log'] = config.newLogger(arg)
        else:
            _illegalArgument(arg, "'-o, --output'")

    elif opt=="-F" or opt=="--file-output":
        # the blacklist logger has no file output
        if linkcheck.Config.Loggers.has_key(arg) and arg != "blacklist":
            config['fileoutput'].append(
                config.newLogger(arg, {'fileoutput':1}))
        else:
            _illegalArgument(arg, "'-F, --file-output'")

    elif opt=="-i" or opt=="--intern":
        config["internlinks"].append(_compileRegex(arg, "'-i, --intern'"))

    # The short option registered with getopt (and documented in the usage
    # text) is -d; testing for "-l" here meant -d was silently ignored.
    elif opt=="-d" or opt=="--denyallow":
        config["denyallow"] = 1

    elif opt=="-N" or opt=="--nntp-server":
        config["nntpserver"] = arg

    elif opt=="-p" or opt=="--password":
        _password = arg
        constructauth = 1

    elif opt=="-q" or opt=="--quiet":
        config["quiet"] = 1

    elif opt=="-r" or opt=="--recursion-level":
        depth = _parseInt(arg, "'-r, --recursion-level'")
        if depth >= 0:
            config["recursionlevel"] = depth
        else:
            _illegalArgument(arg, "'-r, --recursion-level'")

    elif opt=="-R" or opt=="--robots-txt":
        config["robotstxt"] = 1

    elif opt=="-s" or opt=="--strict":
        config["strict"] = 1

    elif opt=="-t" or opt=="--threads":
        num = _parseInt(arg, "'-t, --threads'")
        # the thread count is only applied if config["threads"] is set and
        # debugging is off
        if config["threads"] and not linkcheck.Config.DebugFlag:
            if num>0:
                config.enableThreading(num)
            else:
                config.disableThreading()

    elif opt=="-u" or opt=="--user":
        _user = arg
        constructauth = 1

    elif opt=="-V" or opt=="--version":
        printVersion()

    elif opt=="-v" or opt=="--verbose":
        # verbose implies warnings
        config["verbose"] = 1
        config["warnings"] = 1

    elif opt=="--wischiwaschi":
        # undocumented option; util1 is imported only when it is used
        import util1
        util1.abbuzze()
        sys.exit(0)

    elif opt=="-w" or opt=="--warnings":
        config["warnings"] = 1

    elif opt=="-W" or opt=="--warning-regex":
        config["warningregex"] = _compileRegex(arg, "'-W, --warning-regex'")
        config["warnings"] = 1

if constructauth:
    # put a match-everything entry in front of the configured ones
    config["authentication"].insert(0, (re.compile(".*"), _user, _password))

# construct the url list
# if we use blacklist mode, try to read ~/.blacklist
if config["log"].__class__ == linkcheck.Logging.BlacklistLogger and \
   os.path.exists(config['log'].filename):
    args = open(config['log'].filename).readlines()

if len(args)==0:
    print _("warning: no files or urls given")

for url in args:
    url = string.strip(url)
    if not (":" in url):
        if re.compile("^ftp\.").match(url):
            url = "ftp://"+url
        elif re.compile("^www\.").match(url):
            url = "http://"+url
    config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))

# check the urls
linkcheck.checkUrls(config)
