#!/usr/bin/env python

# file scripts/fedora-checksums
# 
#   Copyright 2012 Emory University Libraries
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.


# NOTE: more detailed documentation & usage examples are included in
# scripts/__init__.py for inclusion in sphinx docs.

import argparse
from collections import defaultdict
import csv
from eulfedora.server import Repository
from eulfedora.rdfns import model as modelns
from getpass import getpass
import os
import signal
import sys

try:
    from progressbar import ProgressBar, Bar, Percentage, ETA, SimpleProgress, Timer
except ImportError:
    ProgressBar = None


class FedoraChecksums(object):
    '''Command-line script logic for validating or repairing datastream
    checksums on Fedora repository content.

    Use :meth:`run` as the entry point: it parses command-line arguments,
    connects to the repository, processes the requested objects in
    ``validate`` or ``repair`` mode, and prints a summary report.
    '''

    # running totals keyed by counter name ('objects', 'ds', 'ok', ...).
    # The class-level default is kept for backward compatibility;
    # __init__ gives every instance its own counter.
    stats = defaultdict(int)

    # file handle and csv writer for optional CSV output (validate mode only)
    csv_file = None
    csv = None

    # interrupt flag to exit the main processing loop when a signal is caught
    interrupted = False

    # URI for Fedora object content model
    object_model = 'info:fedora/fedora-system:FedoraObject-3.0'

    def __init__(self):
        # counters are mutated in place, so a defaultdict stored only on
        # the class would be shared (and accumulate) across instances
        self.stats = defaultdict(int)

    def config_arg_parser(self):
        '''Build the command-line argument parser and store it as
        ``self.parser``.  Defines the ``validate`` and ``repair``
        subcommands, which share the repository connection options,
        the general script options, and the optional list of pids.
        '''
        # common args for either mode
        common_args = argparse.ArgumentParser(add_help=False)
        # fedora connection options
        repo_args = common_args.add_argument_group('Fedora repository connection options')
        repo_args.add_argument('--fedora-root', dest='fedora_root', required=True,
                               help='URL for accessing fedora, e.g. http://localhost:8080/fedora/')
        repo_args.add_argument('--fedora-user', dest='fedora_user', default=None,
                               help='Fedora username')
        # nargs='?' makes the value optional, so the flag can be given with
        # no password; PasswordAction then prompts for it, as the help
        # text promises
        repo_args.add_argument('--fedora-password', dest='fedora_password', metavar='PASSWORD',
                               default=None, nargs='?', action=PasswordAction,
                               help='Password for the specified Fedora user (leave blank to be prompted)')
        # general script options
        common_args.add_argument('--quiet', '-q', default=False, action='store_true',
                                 help='Quiet mode: only output summary report')
        common_args.add_argument('--max', '-m', type=int, metavar='N',
                                 help='Stop after processing the first %(metavar)s objects')

        common_args.add_argument('pids', metavar='PID', nargs='*',
                                 help='list specific pids to be checked (optional)')

        # actual main argument parser
        self.parser = argparse.ArgumentParser(description='''Validate or repair datastream
        checksums for Fedora repository content.  By default, iterates through all content
        objects that are findable via RIsearch and checks or repairs all datastreams.
        ''')

        # add subcommands for script modes
        subparsers = self.parser.add_subparsers(dest='mode', title='subcommands')
        # validate
        v_parser = subparsers.add_parser('validate',
                                         help='check for invalid and missing checksums',
                                         parents=[common_args])
        v_parser.add_argument('--csv-file', dest='csv_file', default=None,
                              help='Output results to the specified CSV file')
        v_parser.add_argument('--all-versions', '-a', dest='all_versions', action='store_true',
                              help='''Check all versions of datastreams
                              (by default, only current versions are checked)''')
        v_parser.add_argument('--missing-only', dest='missing_only', action='store_true',
                              help='''Only check for datastreams with no checksum''')
        # repair
        r_parser = subparsers.add_parser('repair',
                                         help='repair missing checksums',
                                         parents=[common_args])
        r_parser.add_argument('--checksum-type', dest='checksum_type', default='DEFAULT',
                              help='''Checksum type to use; if not specified,
                              will prompt Fedora to use the configured default checksum type''')

    def run(self):
        '''Main entry point: parse command-line arguments, process the
        requested objects in the selected mode, and print a summary.
        A CSV file opened for output is closed even if processing fails.
        '''
        # bind a handler for interrupt signal
        signal.signal(signal.SIGINT, self.interrupt_handler)

        self.config_arg_parser()
        self.args = self.parser.parse_args()

        try:
            # if in validation mode and csv-file is specified, open the
            # file and write the header row
            if self.args.mode == 'validate' and self.args.csv_file:
                # TODO: error handling for file open/write failure
                # NOTE: binary mode is what the Python 2 csv module expects
                self.csv_file = open(self.args.csv_file, 'wb')
                self.csv = csv.writer(self.csv_file, quoting=csv.QUOTE_ALL)
                self.csv.writerow(['pid', 'datastream id', 'date created', 'status',
                                   'mimetype', 'versioned'])

            # TODO: needs fedora error handling (e.g., bad password, hostname, etc)
            repo = Repository(self.args.fedora_root,
                              self.args.fedora_user, self.args.fedora_password)

            self._process_objects(repo)

            # summarize what was done
            if self.args.mode == 'validate':
                self.validation_summary()
            elif self.args.mode == 'repair':
                self.repair_summary()
        finally:
            # if a csv file was opened, close it (also on errors)
            if self.csv_file:
                self.csv_file.close()

    def _process_objects(self, repo):
        '''Validate or repair every datastream of each selected object,
        updating ``self.stats`` and the optional progress bar as it goes.

        :param repo: repository connection used to find and load objects
        '''
        if self.args.pids:
            # if pids were specified on the command line, use those
            # get distinct pid list (only process each object once)
            object_pids = set(self.args.pids)
        else:
            # otherwise, process all find-able objects
            object_pids = list(repo.risearch.get_subjects(modelns.hasModel,
                                                          self.object_model))

        # number of objects that will actually be processed: --max only
        # lowers the total, it cannot raise it past what is available
        total = len(object_pids)
        if self.args.max:
            total = min(total, self.args.max)

        # init progress bar if available and we're checking enough objects
        pid_pbar = None
        if total >= 10 and ProgressBar and os.isatty(sys.stderr.fileno()):
            widgets = [Percentage(), ' (', SimpleProgress(), ')',
                       Bar(), ETA()]
            pid_pbar = ProgressBar(widgets=widgets, maxval=total).start()

        for pid in object_pids:
            obj = repo.get_object(pid=pid)
            if not obj.exists:
                print("Error: %s does not exist or is inaccessible" % pid)
                # honor a pending interrupt here too; otherwise it would be
                # ignored for as long as the loop keeps hitting missing objects
                if self.interrupted:
                    break
                continue

            for dsid in obj.ds_list.iterkeys():
                dsobj = obj.getDatastreamObject(dsid)
                self.stats['ds'] += 1

                if self.args.mode == 'validate':
                    self.validate_datastream(dsobj)

                elif self.args.mode == 'repair':
                    self.repair_datastream(dsobj)

            self.stats['objects'] += 1

            if pid_pbar:
                pid_pbar.update(self.stats['objects'])

            # if interrupted or at a specified max, quit
            if self.interrupted or \
                   self.args.max and self.stats['objects'] == self.args.max:
                break

        if pid_pbar and not self.interrupted:
            pid_pbar.finish()

    def validate_datastream(self, dsobj):
        '''Check a single datastream in **validation** mode: either
        every version listed in its history (when the script was run
        with ``--all-versions``) or only the current version.

        :param dsobj: :class:`~eulfedora.models.DatastreamObject` to
            be checked
        '''
        if self.args.all_versions:
            # check every version of this datastream
            history = dsobj.obj.api.getDatastreamHistory(dsobj.obj.pid, dsobj.id)
            for ds in history.datastreams:
                self.check_datastream(dsobj, ds.createDate)
                self.stats['ds_versions'] += 1

        else:
            # current version only
            self.check_datastream(dsobj)

    def check_datastream(self, dsobj, date=None):
        '''Check the validity of a particular datastream.  Checks for
        invalid datastreams using
        :meth:`~eulfedora.models.DatastreamObject.validate_checksum`,
        and for no checksum (checksum type of ``DISABLED`` or
        checksum value of ``none``).  Optionally reports on the status
        and/or adds it to CSV file, depending on the arguments the
        script was called with.

        :param dsobj: :class:`~eulfedora.models.DatastreamObject` to
            be checked
        :param date: optional date/time for a particular
            version of the datastream to be checked; when not specified,
            the current version will be checked
        '''

        if not self.args.missing_only and not dsobj.validate_checksum(date=date):
            status = 'invalid'

        # if the checksum in fedora is stored as DISABLED/none,
        # validate_checksum will return True - but that may not be
        # what we want, so report as missing.

        # NOTE: this ignores date param (probably ok, but should document/clarify)
        elif dsobj.checksum_type == 'DISABLED' or dsobj.checksum == 'none':
            status = 'missing'

        else:
            status = 'ok'

        self.stats[status] += 1

        # compare by value: identity comparison against a string literal
        # only works by accident of interning
        if status != 'ok':
            if not self.args.quiet:
                print("%s/%s - %s checksum (%s)" %
                      (dsobj.obj.pid, dsobj.id, status, date or dsobj.created))

            if self.csv:
                self.csv.writerow([dsobj.obj.pid, dsobj.id, date or dsobj.created, status,
                                   dsobj.mimetype, dsobj.versionable])

    def validation_summary(self):
        '''Summarize what was done when the script was run in
        **validation** mode.'''
        totals = '\nTested %(objects)d object(s), %(ds)d datastream(s)' % self.stats
        if self.args.all_versions:
            totals += ', %(ds_versions)d datastream version(s)' % self.stats
        print(totals)
        if not self.args.missing_only:
            print('%(invalid)d invalid checksum(s)' % self.stats)
        print('%(missing)d datastream(s) with no checksum' % self.stats)

    def repair_datastream(self, dsobj):
        '''Check for and repair a missing checksum on a single
        datastream.  If checksum type is ``DISABLED`` or checksum
        value is ``none``, update the checksum type and save the
        datastream, prompting Fedora to calculate a new checksum of
        the requested type.  Save errors are reported and counted
        rather than raised, so one failure does not stop the run.

        :param dsobj: :class:`~eulfedora.models.DatastreamObject`
        '''
        if dsobj.checksum_type == 'DISABLED' or dsobj.checksum == 'none':
            dsobj.checksum_type = self.args.checksum_type
            try:
                saved = dsobj.save('updating missing checksum')
                if saved:
                    self.stats['ds_updated'] += 1
            except Exception as e:
                print('Error saving %s/%s : %s' %
                      (dsobj.obj.pid, dsobj.id, e))
                self.stats['ds_err'] += 1

    def repair_summary(self):
        '''Summarize what was done when the script was run in
        **repair** mode.'''

        print('\nChecked %(objects)d object(s), updated %(ds_updated)d datastream(s)' %
              self.stats)
        if self.stats['ds_err']:
            print('Error saving %(ds_err)d datastream(s)' % self.stats)

    def interrupt_handler(self, signum, frame):
        '''Gracefully handle a SIGINT, if possible. Sets a flag so main script
        loop can exit cleanly, and restores the default SIGINT behavior,
        so that a second interrupt will stop the script.
        '''
        if signum == signal.SIGINT:
            # restore default signal handler so a second SIGINT can be used to quit
            signal.signal(signal.SIGINT, signal.SIG_DFL)
            # set interrupt flag so main loop knows to quit at a reasonable time
            self.interrupted = True
            # report if script is in the middle of an object
            print('Script will exit after processing the current object.')
            print('(Ctrl-C / Interrupt again to quit immediately)')


class PasswordAction(argparse.Action):
    '''Argparse action that stores a password on the namespace, falling
    back to an interactive :func:`getpass.getpass` prompt when no value
    (or an empty one) is supplied on the command line.'''
    def __call__(self, parser, namespace, value, option_string=None):
        # a non-empty command-line value wins; otherwise prompt for one
        password = value if value else getpass()
        setattr(namespace, self.dest, password)


# script entry point: run the checksum tool only when this file is executed
# directly, not when it is imported
if __name__ == '__main__':
    FedoraChecksums().run()
