package Debian::Packages::Compare;
use warnings;
use strict;
use Carp;
use IO::File;
use File::Basename;
use Parse::DebControl;
use Parse::Debian::Packages;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError) ;
use Exporter;
use vars qw / $base %complain /;

=pod

=head1 Name

Debian::Packages::Compare - emdebian repository comparison support

=cut

=head1 VERSION

Version 0.2.0

=cut

use vars qw(@ISA @EXPORT);
@ISA=qw(Exporter);
@EXPORT=qw(read_locale read_packages read_sources set_base
 get_missing_sources get_missing_binaries get_britney_list
 get_single_package get_archlist get_locale_roots get_base
 get_components get_britney_complaint get_missing_builddeps );

our $OURVERSION = "0.2.1";

=head1 Synopsis

Read the Packages file, create a hash of Package and version -
once per arch, one per suite, one pair per repo. The underlying
libparse-debian-packages-perl is a very simple module, there probably
isn't any point putting extra data into the comparison hashes, get the
data necessary and load the full Packages data separately using
get_single_package.

This module is currently tied to the repository layout used by
reprepro in order to identify the architecture list and various
other pieces of meta-data. In time, functions can be added to
provide such lists.

The module expects to find all repositories beneath
a single base directory:

 $base/$repo_name/conf/distributions

If $base is undefined or if the repository directories cannot be
found, subsequent functions each return undef.

etc.

Example code:

 use Debian::Packages::Compare;

 my $base = '/path/to/repositories/'
 &set_base($base);

 my $debu  = &read_packages ('unstable', 'filter');
 my $gripu = &read_packages ('unstable', 'grip');
 my %debianunstable = %$debu  if (defined $debu);
 my %gripunstable   = %$gripu if (defined $gripu);
 foreach my $pkg (sort keys %debianunstable)
 {
   if ($debianunstable{$pkg}{'source'} ne $gripunstable{$pkg}{'source'})
   {
     # $pkg is either not in grip repository or is a different version
   }
 }

The main features for subsequent versions will involve more verbose
error handling.

 perl -MDebian::Packages::Compare -MData::Dumper -e '&set_base("/opt/reprepro"); \
  $c = &read_packages("testing","grip"); print Dumper ($$c{"dpkg"});';

=cut

=head1 get_archlist

Requires the reprepro file layout, currently.

Reads the supported architectures and returns a list.

=cut

sub get_archlist
{
	my (@stanzas, $parser, $conf, $stanza);
	my ($suite, $repo) = @_;
	my @archlist=();
	return undef unless defined $base;
	$parser = new Parse::DebControl;
	my $input = "$base/$repo/conf/distributions";
	$input =~ s://:/:g;
	return undef unless (-f $input);
	$conf = $parser->parse_file("$input", {'stripComments' => 'true'});
	@stanzas = @$conf;
	foreach $stanza (@stanzas)
	{
		if (($$stanza{'Codename'} eq "$suite")
			or ($$stanza{'Suite'} eq "$suite"))
		{
			my $line = $$stanza{'Architectures'};
			@archlist = split(' ', $line);
		}
	}
	@archlist = sort (@archlist);
	return \@archlist;
}

=head1 get_locale_roots

Requires the reprepro file layout, currently.

Reads the supported locale roots and returns a list.

=cut

sub get_locale_roots
{
	my (@stanzas, $parser, $conf, $stanza);
	my ($suite, $repo) = @_;
	my @locroots=();
	return undef unless defined $base;
	$parser = new Parse::DebControl;
	my $input = "$base/$repo/conf/distributions";
	$input =~ s://:/:g;
	return undef unless (-f $input);
	$conf = $parser->parse_file("$input", {'stripComments' => 'true'});
	@stanzas = @$conf;
	foreach $stanza (@stanzas)
	{
		if (($$stanza{'Codename'} eq "$suite")
			or ($$stanza{'Suite'} eq "$suite"))
		{
			my $line = $$stanza{'Components'};
			@locroots = split(' ', $line);
		}
	}
	return \@locroots;
}

=head1 get_components

Similar to get_locale_roots but for ordinary repositories so that
component splits (like dev and doc in Emdebian Grip) can be implicitly
supported. Note that repositories with contrib or non-free will
likely get confused results with this kind of support.

=cut

sub get_components
{
	my (@stanzas, $parser, $conf, $stanza);
	my ($suite, $repo) = @_;
	my @cmpnts=();
	return undef unless defined $base;
	$parser = new Parse::DebControl;
	my $input = "$base/$repo/conf/distributions";
	$input =~ s://:/:g;
	return undef unless (-f $input);
	$conf = $parser->parse_file("$input", {'stripComments' => 'true'});
	@stanzas = @$conf;
	foreach $stanza (@stanzas)
	{
		if (($$stanza{'Codename'} eq "$suite")
			or ($$stanza{'Suite'} eq "$suite"))
		{
			my $line = $$stanza{'Components'};
			@cmpnts = split(' ', $line);
		}
	}
	return \@cmpnts;
}

=head1 set_base

Set the base directory of the repository (repositories) to be
compared. The module expects to find all repositories beneath
a single base directory:

 $base/$repo_name/dists/$suite/main/binary-$arch/Packages
 $base/$repo_name/dists/$suite/main/source/Sources.gz

=cut

sub set_base
{
	$base = shift;
}

sub get_base
{
	return $base;
}

=head1 read_sources

Similar to read_binaries but expects a GZip compressed Sources.gz
file. Not usually called directly.

=cut

sub read_sources
{
	my ($suite, $repo) = @_;
	if (not defined $base)
	{
		print "Base directory has not been set.";
		return undef;
	}
	my %package=();
	my %list=();
	# Sources are not split into components at this time.
	my $input = "$base/$repo/dists/$suite/main/source/Sources.gz";
	$input =~ s://:/:g;
	if (not -f $input)
	{
		print "Cannot find Sources.gz: $input\n";
		return undef;
	}
	my $z = new IO::Uncompress::Gunzip $input
			or die "IO::Uncompress::Gunzip failed: $GunzipError\n";
	my $parser = Parse::Debian::Packages->new( $z );
	while (%package = $parser->next)
	{
		$list{$package{'Package'}}{'source'}=$package{'Version'};
		my $src = (not defined $package{'Source'}) ?
			$package{'Package'} :
			$package{'Source'};
		$src =~ s/\(.*\)//g;
		$src =~ s/ //g;
		$list{$package{'Package'}}{'Src'}=$src;
		my $dep = $package{'Build-Depends'};
		chomp ($dep) if (defined $dep);
		$list{$package{'Package'}}{'Build-Depends'}=$dep;
	}
	return \%list;
}

=head1 read_binaries

Reads the Packages file and creates a hash of the packages,
architectures and versions in the respective repository
(under the C<$base> directory) and suite for the defined C<@archlist>,
including details of Sources.

Takes two parameters - the suite name and the repository name.

Returns undef if the Packages file cannot be found.

e.g. for the Grip repository:

 $hash{'sed'}{'source'} => '4.1.5-8'
 $hash{'sed'}{'armel'} => '4.1.5-8em1'
 ...

=cut

sub read_packages
{
	my ($suite, $repo) = @_;
	return undef unless defined $base;
	my $src = (read_sources ($suite, $repo));
	if (not defined $src)
	{
		warn ("No sources found in '$repo' repository.\n");
		return undef;
	}
	my %list = %$src;
	undef $src;
	$src = get_archlist ($suite, $repo);
	my @archlist = @$src;
	foreach my $arch (@archlist)
	{
		next if ($arch eq 'source');
		my %package=();
		my ($parser, $fh);
		# support components other than main.
		my $c = get_components ($suite, $repo);
		foreach my $cmpnt (@$c)
		{
			my $file = "$base/$repo/dists/$suite/$cmpnt/binary-${arch}/Packages";
			$file =~ s://:/:g;
			if (not -f $file)
			{
				warn ("Cannot find Packages file: '$file'.\n");
				next;
			}
			$fh = IO::File->new("$file") or die "$!\n";
			$parser = Parse::Debian::Packages->new( $fh );
			while (%package = $parser->next)
			{
				$list{$package{'Package'}}{"$arch"}=$package{'Version'};
				my $src = (not defined $package{'Source'}) ?
					$package{'Package'} :
					$package{'Source'};
				$src =~ s/\(.*\)//g;
				$src =~ s/ //g;
				$list{$package{'Package'}}{'Src'}=$src
					if (not defined $list{$package{'Package'}}{'Src'});
			}
			$fh->close;
			undef $fh;
		}
	}
	return \%list;
}

=head1 read_locale

Specialised function that handles the locale-root organisation of the
TDeb repository. The internal locale root component structure is hidden
in the returned hash, the component name is listed separately and
contained in the package name. Source data is retained under the
original source package name and referenced in the hash:

The locale root is contained in C<$list{$package}{'locale'}>
and the source package name in C<$list{$package}{'source'}>

e.g.

 $hash{'sed-locale-pt-br}{'source'} => 'sed'
 $hash{'sed-locale-pt-br}{'locale'} => 'pt'
 $hash{'sed-locale-pt-br}{'armel'} => '4.1.5-8'
 ...
 $hash{'sed'}{'source'} => '4.1.5-8'

=cut

sub read_locale
{
	my ($suite, $repo) = @_;
	return undef unless defined $base;
	my $src = (read_sources ($suite, $repo));
	return undef unless (defined $src);
	my %list = %$src;
	undef $src;
	$src = get_archlist ($suite, $repo);
	my @archlist = @$src;
	undef $src;
	$src = get_locale_roots ($suite, 'locale');
	my @locroots = @$src;
	undef $src;
	return undef unless (@archlist);
	return undef unless (@locroots);
	foreach my $cmpnt (@locroots)
	{
		foreach my $arch (@archlist)
		{
			next if ($arch eq 'source');
			my %package=();
			my ($parser, $fh);
			my $file = "$base/$repo/dists/$suite/$cmpnt/binary-${arch}/Packages";
			$file =~ s://:/:g;
			if (not -f $file)
			{
				warn ("Cannot find Packages file: '$file'.\n");
				next;
			}
			$fh = IO::File->new("$file");
			$parser = Parse::Debian::Packages->new( $fh );
			while (%package = $parser->next)
			{
				$list{$package{'Package'}}{"$arch"}=$package{'Version'};
				$list{$package{'Package'}}{'locale'}=$cmpnt;
				$list{$package{'Package'}}{'source'}=$package{'Source'}
			}
			$fh->close;
			undef $fh;
		}
	}
	return \%list;
}

=head1 get_single_package

Retrieve the full Packages record for a single package,
binary (single architecture) or source (.dsc).

=cut

sub get_single_package
{
	my ($suite, $repo, $pkg, $arch) = @_;
	my ($parser, $fh, %list, %package);
	return undef unless defined $base;
	if ($arch ne 'source')
	{
		# support components other than main.
		my $c = get_components ($suite, $repo);
		foreach my $cmpnt (@$c)
		{
			my $file = "$base/$repo/dists/$suite/$cmpnt/binary-${arch}/Packages";
			$file =~ s://:/:g;
			return undef if (not -f $file);
			$fh = IO::File->new("$file") or die "$!\n";
			$parser = Parse::Debian::Packages->new( $fh );
			while (%package = $parser->next)
			{
				%list = (%package) if ($package{'Package'} eq $pkg);
			}
			$fh->close;
			undef $fh;
		}
		return \%list;
	}
	# support components other than main.
	my $c = get_components ($suite, $repo);
	foreach my $cmpnt (@$c)
	{
		my $input = "$base/$repo/dists/$suite/$cmpnt/source/Sources.gz";
		$input =~ s://:/:g;
		return undef unless (-f $input);
		my $z = new IO::Uncompress::Gunzip $input
				or die "IO::Uncompress::Gunzip failed: $GunzipError\n";
		$parser = Parse::Debian::Packages->new( $z );
		while (%package = $parser->next)
		{
			%list = (%package) if ($package{'Package'} eq $pkg);
		}
	}
	return \%list;
}

=head1 get_missing_sources

Compare two repositories for the same suite and return a list of
source packages that are in the first repository but not in the
second. (Older versions are ignored.)

TODO: need a get_outdated_sources and get_outdated_binaries too.

=cut

sub get_missing_sources
{
	my ($suite, $repo1, $repo2) = @_;
	return undef unless (defined $suite and defined $base and
		defined $repo1 and defined $repo2);
	my $src = (read_sources ($suite, $repo1));
	my %orig = %$src;
	undef $src;
	$src = (read_sources ($suite, $repo2));
	my %comp =%$src;
	my %list=();

# BUG: needs to look for old sources as well as missing ones.

	foreach my $pkg (sort keys %orig)
	{
		# convert a binary to a source to ensure each binary
		# has a source package
		my $src = $orig{$pkg}{'Src'};
		next if (not defined $orig{$pkg}{'source'});
		if (not defined $comp{$pkg}{'source'})
		{
			$list{$pkg} = $orig{$pkg};
			next;
		}
	}
	return \%list;
}

sub get_missing_builddeps
{
	my ($suite, $repo1, $repo2) = @_;
	return undef unless (defined $suite and defined $base and
		defined $repo1 and defined $repo2);
	my $src = (read_sources ($suite, $repo1));
	my %orig = %$src;
	undef $src;
	$src = (read_sources ($suite, $repo2));
	my %comp =%$src;
	$src = (read_packages ($suite, $repo1));
	my %bin = %$src;
	my %list=();
	foreach my $pkg (sort keys %orig)
	{
		my $deps = $orig{$pkg}{'Build-Depends'};
		next if not defined ($deps);
		my @d = split(",", $deps);
		foreach my $dd (sort @d)
		{
			if ($dd =~ /\|/)
			{
				my @ors = split('\|', $dd);
				foreach my $or (@ors)
				{
					$or =~ s/^ +//;
					$or =~ s/ +$//;
					$or =~ s/ *\[.*\] *//;
					$or =~ s/ *\(.*\) *//;
					next if ($or =~ /^not\+/);
					next if (defined $orig{$or}{'source'});
					$list{$or}++ if (not defined $bin{$or});
				}
				next;
			}
			$dd =~ s/^ +//;
			$dd =~ s/ +$//;
			$dd =~ s/ *\[.*\] *//;
			$dd =~ s/ *\(.*\) *//;
			$dd =~ s/ *\(.*\) *//;
			next if (defined $orig{$dd}{'source'});
			$list{$dd}++ if (not defined $bin{$dd});
		}
		$deps = join (" ", sort keys %list);
	}
	return \%list;
}

=head1 get_missing_binaries

Compare two repositories for the same suite and return a list of
binary packages that exist in both repositories as source packages
but only exist as binary packages in the first repository, not in
the second.

Emdebian version suffixes are automatically cleared in the
comparison.

=cut

sub get_missing_binaries
{
	my ($suite, $repo1, $repo2) = @_;
	return undef unless (defined $suite and defined $base and
		defined $repo1 and defined $repo2);
	my ($orig_vers, $comp_vers);
	my $src = (read_packages ($suite, $repo1));
	my %orig = %$src;
	undef $src;
	$src = (read_packages ($suite, $repo2));
	my %comp =%$src;
	undef $src;
	# if the archlist differs, it is only a problem if
	# repo2 supports more architectures than repo1 which
	# doesn't really make a lot of sense.
	$src = get_archlist($suite, $repo1);
	my @archlist = @$src;
	my %list=();
	foreach my $pkg (sort keys %orig)
	{
		foreach my $arch (@archlist)
		{
			next if ($arch eq 'source');
			$orig_vers = $orig{$pkg}{$arch};
			# probably a source-only package.
			next unless (defined $orig_vers);
			$comp_vers = (defined $comp{$pkg}{$arch}) ? $comp{$pkg}{$arch} : '';
			$comp_vers =~ s/em[0-9]$//;
			next if ($orig_vers eq $comp_vers);
			$list{$pkg}{$arch}{$repo1} = $orig_vers;
			$list{$pkg}{$arch}{$repo2} = $comp_vers;
		}
	}
	return \%list;
}

=head1 get_britney_list

Compare two repositories, each with unstable and testing,
return a list of source packages that are suitable for migration.

To be suitable for migration, a package must exist in the first
repository at the same version in both unstable and testing. It must
also exist in the second repository for unstable and be the same
version as in the first repository. Finally, the source package
must either be absent from testing in the second repository or
be at a lower version than in unstable.

This is a Debian-only feature and no support is available for
repositories that do not implement unstable and testing suites in
precisely the same manner as Debian.

All data is reloaded fresh each time the function is run.

 1. repo1 unstable must match repo1 testing
 2. repo1 unstable must match repo2 unstable
 3. repo2 unstable must B<be newer> than repo2 testing
 4. All architectures are compared, including source.

Returns undef in case of error.

The returned hash is indexed under the source package name.

To get the list of packages that raised a complaint about
missing sources, call get_britney_complaint - the returned
list will be undefined unless get_britney_list has already
been called.

=cut

sub get_britney_list
{
	my $str;
	%complain=();
	my ($repo1, $repo2) = @_;
	return undef unless defined $base;
	return undef unless (defined $repo1 and defined $repo2);
	my $suite = 'unstable';
	my $retval = -9;
	my $src = (read_packages ($suite, $repo1));
	return undef unless (defined $src);
	my %origunstable = %$src;
	undef $src;
	$src = (read_packages ($suite, $repo2));
	return undef unless (defined $src);
	my %compunstable = %$src;
	undef $src;
	$suite = 'testing';
	$src = (read_packages ($suite, $repo1));
	return undef unless (defined $src);
	my %origtesting = %$src;
	undef $src;
	$src = (read_packages ($suite, $repo2));
	return undef unless (defined $src);
	my %comptesting = %$src;
	undef $src;
	$src = get_archlist($suite, $repo1);
	my @archlist = @$src;
	undef $src;
	my %list=();
	foreach my $pkg (sort keys %origunstable)
	{
		my $skip;
		foreach my $arch (@archlist)
		{
			# check criterion 1.
			next if (not defined $origtesting{$pkg}{$arch});
			$skip++ if ($origunstable{$pkg}{$arch} ne $origtesting{$pkg}{$arch});
		}
		next if (defined $skip);
		undef $skip;
		# check criterion 2.
		my $src= $origunstable{$pkg}{'Src'};
		chomp ($src);
		$src =~ s/ //g;
		if (not defined $compunstable{$src}{'source'})
		{
			$complain{$src}++;
			$str = "ERR: Cannot find a source package '$src' for $pkg ".
			"in Grip unstable.\n";
			warn ($str);
			$skip++;
			next;
		}
		$skip++ if ($origunstable{$src}{'source'} ne $compunstable{$src}{'source'});
		if (defined $skip)
		{
			$complain{$src}++;
			print "$src (source) is out of date in $repo2 unstable: " .
				"$origunstable{$src}{'source'} ne $compunstable{$src}{'source'}\n";
			next;
		}
		undef $skip;
		foreach my $arch (@archlist)
		{
			# criterion 3.
			# check the actual result with dpkg --compare-versions
			# retain any Emdebian version suffixes - comparing within
			# the same repo
			next if (not defined $compunstable{$pkg}{$arch});
			if (not defined $comptesting{$pkg}{$arch})
			{
				$list{$pkg}{$arch}{'unstable'} = $compunstable{$pkg}{$arch};
				next;
			}
			$retval = system ("dpkg --compare-versions ".
				$compunstable{$pkg}{$arch}." '>>' ".$comptesting{$pkg}{$arch});
			if ($retval == 0)
			{
				$list{$pkg}{$arch}{'unstable'} = $compunstable{$pkg}{$arch};
				$list{$pkg}{$arch}{'testing'} = $comptesting{$pkg}{$arch};
			}
			$retval /= 256;
			if ($retval == 1)
			{
				$retval = system ("dpkg --compare-versions ".
					$compunstable{$pkg}{$arch}." '>=' ".
					$comptesting{$pkg}{$arch});
				if ($retval == 1)
				{
					$list{$pkg}{$arch}{'unstable'} = $compunstable{$pkg}{$arch};
					$list{$pkg}{$arch}{'testing'} = $comptesting{$pkg}{$arch};
				}
			}
		}
	}
	if (scalar (keys %complain) > 0)
	{
		$str = (scalar (keys %complain) == 1) ?
			"\tAdd the missing source package explicitly ".
				"using\n\tem_autogrip -b $base -s ":
			"\tAdd the missing source packages explicitly ".
				"using\n\tem_autogrip -b $base --noskipold -s ";
		$str .= join (" ", sort (keys %complain));
		$base = get_base;
		$str .= "\nIn some situations, this can indicate that the ".
		"filter repository\nis broken and the package may have to be ".
		"removed from the filter\nand re-added. e.g.\n".
		"reprepro -b ${base}\$filter removesrc unstable \$source\n".
		"reprepro -b ${base}\$filter removesrc testing \$source\n".
		"You may prefer to skip certain source packages (e.g. lsb) due\n".
		"to dependency issues with the binary package of the same name.\n";
		warn ($str);
	}
	return \%list;
}

=head1 get_britney_complaint

If get_britney_list comes up with source packages that are missing
for a testing migration, get_britney_complaint returns the list
of source package names that were identified.

If get_britney_list has not been called, returns undef.

If there are no complaints, also returns undef.

=cut

sub get_britney_complaint
{
	return undef if (not defined %complain);
	return undef if (scalar (keys %complain) == 0);
	my @c = (sort keys %complain);
	return \@c;
}

=head1 Copyright and Licence

 Copyright (C) 2008  Neil Williams <codehelp@debian.org>

 This package is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.

=cut

=head1 AUTHOR

Neil Williams, C<< <codehelp@debian.org> >>

=cut

=head1 BUGS

Please report any bugs or feature requests to the Debian Bug Tracking
System using C<reportbug libdebian-packages-compare-perl>.

=cut

1;
