package Dpkg::Copyright::Scanner ;

use strict;
use warnings;

use 5.20.0;
use IO::Pipe;
use Exporter::Lite;
use Array::IntSpan;
use Path::Tiny;

use feature qw/postderef signatures/;
no warnings qw/experimental::postderef experimental::signatures/;

binmode STDOUT, ':utf8';

our @EXPORT = qw(scan_files print_copyright);

my $whitespace_list_delimiter = $ENV{'whitespace_list_delimiter'} || "\n ";

# license and copyright sanitisation pilfered from Jonas's
# licensecheck2dep5 Originally GPL-2+, permission to license this
# derivative work to LGPL-2.1+ was given by Jonas.
# see https://lists.alioth.debian.org/pipermail/pkg-perl-maintainers/2015-March/084900.html

# Copyright 2014 Dominique Dumont <dod@debian.org>
# Copyright © 2005-2012 Jonas Smedegaard <dr@jones.dk>
# Description: Reformat licencecheck output to copyright file format
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

sub print_copyright ( %args ) {
    my ($files, $copyrights_by_id) = scan_files(%args);

    # split file path and fill recursive hash, leaf is id
    my $split_files = {};
    foreach my $path (sort keys %$files) {
        __create_tree_leaf_from_paths ($split_files,$path,$files->{$path});
    }

    # regroup %files hash: all leaves have same id -> wild card
    __squash($split_files);

    # pack files by copyright id
    my @packed = __pack_files($split_files);

    my @out ;

    foreach my $p (@packed) {
        my ($id, @paths) = $p->@*;
        my ($c,$l) = $copyrights_by_id->[$id]->@*;

        next if $c eq 'no-info-found';

        # don't print directory info covered by same info in directory above
        next if $paths[0] =~ /\.$/;

        push @out,
            "Files: ", join($whitespace_list_delimiter, @paths )."\n",
            "Copyright: $c\n",
            "License: $l\n", "\n";
    }

    if ($args{out}) {
        $args{out}->spew_utf8( @out);
    }
    else {
        binmode(STDOUT, ":utf8");
        print @out;
    }
}

# option to skip UNKNOWN ?
# load a file to override some entries ?
sub scan_files ( %args ) {

    my @lines ;
    if (my $file = $ENV{COPYRIGHT_SCANNER_INPUT}) {
        @lines = path($file)->lines_utf8 ; # for tests
    }
    elsif ($args{in}) {
        @lines = $args{in}->lines_utf8; # for other tests
    }
    else {
        foreach my $opts ( ('',q! -c '(?i:readme|license|copying).*'!)) {
            my $pipe = IO::Pipe->new();
            $pipe->reader("licensecheck --copyright -m -r $opts .");
            binmode($pipe, ":encoding(UTF-8)");
            push @lines, $pipe->getlines;
            $pipe->close;
        }
    }

    my %copyrights ;
    my $files = {};
    my $id = 0;

    foreach my $line (sort @lines) {
        chomp $line;
        # say "found: $line";
        my ($f,$l,$c) = split /\t/, $line; 
        $f =~ s!\./!!;

        $l =~ s/([*?\\])/\\$1/g;
        $l =~ s/\s*\(unversioned\/unknown version\)//;
        $l =~ s/\s*\(with incorrect FSF address\)//;
        $l =~ s/(\w+)\s+\(v([^)]+) or v([^)]+)\)/uc($1)."-$2 or ".uc($1)."-$3"/e;
        $l =~ s/\s+\(v([^)]+) or later\)/-$1+/;
        $l =~ s/\s+\(v([^)]+)\)/-$1/;
        $l =~ s/^\s*(GENERATED FILE)/UNKNOWN/;
        $l =~ s/\s+(GENERATED FILE)//;
        $l =~ s/\bzlib\/libpng\b/Zlib/;
        $l =~ s/\bMIT\/X11 \(BSD like\)/Expat/;
        $l =~ s/\bBSD \((\d) clause\)/BSD-$1-clause/;
        $l =~ s/\bpublic domain\b/public-domain/i;

        # this is very fragile. may need to change license-check to output license keyword
        $l =~ s/ / or /g unless $l =~ /\bor\b/;

        $c =~ s/'//g;
        $c =~ s/^&copy;\s*//;
        $c =~ s/(?<=\b\d{4})\s*-\s*\d{4}(?=\s*-\s*(\d{4})\b)//g;
        $c =~ s/(\d+)\s*-\s*(\d+)/$1-$2/g;
        $c =~ s/\b(\d{4}),?\s+([\S^\d])/$1, $2/g;
        $c =~ s/\s+by\s+//g;
        $c =~ s/(\\n)*all\s+rights?\s+reserved\.?(\\n)*\s*//gi; # yes there are literal \n
        $c = 'no-info-found' if $c =~ /^\*No/;
        $c =~ s/\(r\)//g;
        $c =~ s!^[\s,/*]|[\s,/*-]+$!!g;
        $c =~ s/--/-/g;
        $c =~ s!\s+\*/\s+! !;

        $c = __pack_copyright($c);

        next if $l =~ /unknown/i and $c =~ /no-info-found/i;

        #say "Storing '$f' : '$c' '$l'";
        $files->{$f} = $copyrights{$c}{$l} //= $id++;
    }

    my @copyrights_by_id ;
    foreach my $c (sort keys %copyrights) {
        foreach my $l (sort keys $copyrights{$c}->%* ) {
            my $id = $copyrights{$c}{$l};
            $copyrights_by_id[$id] = [ $c, $l ] ;
        }
    }

    say "No copyright information found" unless keys %$files;

    my $merged_c_info = __squash_copyrights_years (\@copyrights_by_id) ;

    # replace the old ids with news ids
    __swap_merged_ids($files, $merged_c_info);

    # stop here for update ...
    return ($files, \@copyrights_by_id) ;
}

sub __split_copyright ($c) {
    my ($years,$owner) = $c =~ /([\s,\d-]+)(.*)/;
    # say "undef year in $c" unless defined $years;
    return unless defined $years;
    my @data = split /(?<=\d)[,\s]+/, $years;
    return unless defined $owner;
    $owner =~ s/^[\s.,-]+|[\s,*-]+$//g;
    return ($owner,@data);
}

sub __create_tree_leaf_from_paths ($h,$path,$value) {
    # explode path in subpaths
    my @subpaths = split '/', $path;
    my $last = pop @subpaths;
    map { $h = $h->{$_} ||= {} } @subpaths ;
    $h->{$last} = $value;
}

sub __pack_copyright ($r) {

    return $r if $r eq 'no-info-found';
    my %cop;
    $r =~ /^[\s\W]+|[\s\W]+$/g;
    foreach my $c ( split( m!\s+/\s+!, $r)) {
        my ($owner, @data) = __split_copyright($c);
        return $r unless defined $owner;
        $cop{$owner} ||= [] ;
        push $cop{$owner}->@*, @data ;
    }
    my @res ;
    foreach my $owner (sort keys %cop) {
        my $span = Array::IntSpan->new();
        my $data = $cop{$owner};
        foreach my $year ($data->@*) {
            return $r if $year =~ /[^\d-]/; # bail-out
            # take care of ranges written like 2002-3
            $year =~ s/^(\d\d\d)(\d)-(\d)$/$1$2-$1$3/;
            # take care of ranges written like 2014-15
            $year =~ s/^(\d\d)(\d\d)-(\d\d)$/$1$2-$1$3/;
            eval {
                $span->set_range_as_string($year, $owner);
            };
            if ($@) {
                say "Invalid year range in copyright: $r";
                return $r;
            }
        }
        $span->consolidate();
        push @res, $span->get_range_list. ', '. $owner;
    }
    return join("\n ",reverse sort @res);
}

#in each directory, pack files that have the same copyright/license information
# traverse recursively %h (whose structure matches the scanned directory)
# @path keeps track of the recursion depth to provide the file path
sub __pack_files ($h) {

    my @res ;
    __pack_dir($h,\@res) ;

    # sort by first path listed in there
    my $sort_path = sub {
        $a->[1] cmp $b->[1];
    };

    return sort $sort_path @res ;
}

sub __pack_dir ($h, $pack, @path) {
    my %pack_by_id;
    foreach my $file (sort keys %$h) {
        my $id = $h->{$file};
        if (ref($id)) {
            __pack_dir($id, $pack, @path, $file) ;
        }
        elsif (defined $pack_by_id{$id} ) {
            push $pack_by_id{$id}->@*, join('/',@path,$file);
        }
        else {
            $pack_by_id{$id} = [ join('/',@path,$file) ] ;
        }
    }

    push $pack->@*, map { [ $_, $pack_by_id{$_}->@* ];  } keys %pack_by_id ;
}

# find ids that can be merged together
# I.e. merge entries with same license and same set of owners. In this
# case the years are merged together.
sub __squash_copyrights_years ($copyrights_by_id) {

    my %id_year_by_same_owner_license;
    for (my $id = 0; $id < $copyrights_by_id->@* ; $id++ ) {
        my ($c,$l) = $copyrights_by_id->[$id]->@* ;
        #say "id $id: c $c l $l";
        my @owners ;
        my @years ;
        foreach my $line (split(/\n\s+/,$c)) {
            my ($owner, @year) = __split_copyright($line);
            next unless defined $owner;
            push @owners, $owner;
            push @years, join(',',@year);
        }
        my $k = join('|', $l, @owners);
        $id_year_by_same_owner_license{$k} //= [];
        push $id_year_by_same_owner_license{$k}->@*, [ $id, @years ];
    }

    my @merged_c_info;
    # now detect where %id_year_by_same_owner_license references more
    # than one id this means that several entries can be merged in a
    # *new* id (new id to avoid cloberring data of other directories)
    foreach my $owner_license (sort keys %id_year_by_same_owner_license) {
        my @entries =  $id_year_by_same_owner_license{$owner_license}->@* ;
        next unless @entries > 1;

        my ($l,@owners) = split /\|/, $owner_license;

        # create new copyright info with coaslesced years
        my @squashed_c = __coalesce_copyright_years(\@entries,\@owners) ;
        next unless @squashed_c ; # give up this entry when problem

        # store (c) info with coalesced years in new item of $copyrights_by_id
        my $new_id = $copyrights_by_id->@* ;
        my $new_cop = join("\n ",@squashed_c) ;
        $copyrights_by_id->[$new_id] = [ $new_cop , $l ];
        #say "created id $new_id with c $new_cop l $l";
        # fill the swap table entry-id -> coaslesces entry-id
        foreach my $id ( map { $_->[0]} @entries) {
            $merged_c_info[$id] = $new_id;
        }
    }

    return \@merged_c_info;
}

sub __swap_merged_ids ($files, $merged_c_info) {
    foreach my $name (sort keys %$files) {
        my $item = $files->{$name};
        if (ref($item)) {
            __swap_merged_ids($item,$merged_c_info);
        }
        elsif (my $new_id = $merged_c_info->[$item]) {
            $files->{$name} = "$new_id"  ;
        }
    }
}

sub __coalesce_copyright_years($entries, $owners) {
    my @ranges_of_years ;
    # $entries and $owners always have the same size

    foreach my $entry (@$entries) {
        my ($id, @years) = $entry->@* ;

        for (my $i = 0; $i < @years; $i++) {
            return () if $years[$i] =~ /[^\d,\s-]/;
            my $span = $ranges_of_years[$i] //= Array::IntSpan->new();
            return () unless $span; # bail out in case of problems
            eval {
                $span->set_range_as_string($years[$i], 1);
            };
            if ($@) {
                say "Invalid year range: ",$years[$i];
                return ();
            }
        }
    }

    my @squashed_c;
    for (my $i=0; $i < @$owners ; $i++) {
        $ranges_of_years[$i]->consolidate();
        $squashed_c[$i] = $ranges_of_years[$i]->get_range_list.', '.$owners->[$i];
    }

    return @squashed_c;
}

# $h is a tree of hash matching the directory structure. Each leaf is a
# copyright id. Each key is a file name in a directory (not the full path)
sub __squash ($h) {
    my %count ;

    # find main license info found in LICENCE or COPYING or README
    # file
    foreach my $info_name (qw/readme copying license/) {
        my $re = qr!$info_name[.\w]*$!i;
        foreach my $name (sort keys %$h) {
            if ($name =~ $re) {
                my $id = delete $h->{$name};
                #say "del global lic info $name with ".Dumper($id);
                # using 1 means that info from this file is easy to override
                $count{$id} = 1;
            };
        }
    }

    # count the number of times each (c) info is used in this directory.
    # (including the main (c) info of each subdirectory)
    foreach my $name (sort keys %$h) {
        my $item = $h->{$name};
        if (ref($item)) {
            # squash may return a plain id, or a hash with '*' => id ,
            # or a non squashable hash
            $h->{$name} = __squash($item);
        }
        my $id = (ref($item) and defined $item->{'*'}) ? $item->{'*'} : $item ;

        # do not count non squashable hashes (i.e. there's no main (c) info)
        if (not ref ($id)) {
            $count{$id}//=0;
            $count{$id} ++;
        }
    }

    # find the most used (c) info in this directory (or the existing '*' entry)
    # unless info was already found in LICENSE or README content
    my $max = 0;
    my $main_license_id = $h->{'*'};
    if (not defined $main_license_id) {
        foreach my $id (sort keys %count) {
            if ($count{$id} > $max) {
                $max = $count{$id};
                $main_license_id = $id ;
            }
        }
    }

    # all files associated to the most used (c) info are deleted to
    # be represented by '*' entry
    foreach my $name (sort keys %$h) {
        my $item = $h->{$name};
        if (ref($item) and defined $item->{'*'} and $item->{'*'} == $main_license_id) {
            # rename item/* to item/. when covered by ./*
            # this is a "weak" directory info which is handled specially
            $item->{'.'} = delete $item->{'*'};
        }
        if (not ref ($item)) {
            # delete file that is represented by '*' entry
            delete $h->{$name} if $item == $main_license_id;
        }
    }
    # here's the '*' file representing the most used (c) info
    $h->{'*'} //= $main_license_id if defined $main_license_id;

    return $h;
}

1;

__END__

=head1 NAME

 Dpkg::Copyright::Scanner - Scan files to provide copyright data

=head1 SYNOPSIS

 use Dpkg::Copyright::Scanner qw/print_copyright scan_files/;

 # print copyright data on STDOUT
 print_copyright;

 # return a data structure containing copyright information
 my @copyright_data = scan_files();


=head1 DESCRIPTION

This modules scans current package directory to extract copyright and
license information. Information are packed in a way to ease review and
maintenance. Files information is grouped with wildcards ('*') to reduce
the list of files.

=head1 METHODS

=head2 print_copyright

Print copyright information on STDOUT like L<scan-copyrights>.

=head2 scan_files

Return a data structure with copyright and license information.

The structure is a list of list:

 [
   [
     [ path1 ,path2, ...],
     copyright,
     license_short_name
   ],
   ...
 ]

Example:

 [
  [
    [ '*' ],
    '1994-2001, by Frank Pilhofer.',
    'GPL-2+'
  ],
  [
    [ 'pan/*' ],
    '2002-2006, Charles Kerr <charles@rebelbase.com>',
    'GPL-2'
  ],
  [
    [
      'pan/data/parts.cc',
      'pan/data/parts.h'
    ],
    '2002-2007, Charles Kerr <charles@rebelbase.com>',
    'GPL-2'
  ],
 ]

=head1 Encoding

The output of L<licensecheck> is expected to be utf-8. Which means
that the source files scanned by L<licensecheck> should also be
encoded in utf-8. In practice, this will impact only copyright owner
name which may be garbled if comments are not encoded in utf-8.

=head1 BUGS

Extracting license and copyright data from unstructured comments is not reliable.
User must check manually the files when no copyright info is found or when the
license is unknown.

=head1 SEE ALSO

L<licensecheck>, C<licensecheck2dep5> from C<cdbs> package

=head1 AUTHOR

Dominique Dumont <dod@debian.org>

=cut

