package Daizu::Revision;
use warnings;
use strict;
use base 'Exporter';
our @EXPORT_OK = qw(
load_revision
known_branches branch_and_path file_guid
);
use SVN::Core;
use SVN::Ra;
use SVN::Repos;
use SVN::Delta;
use Carp qw( croak );
use Carp::Assert qw( assert DEBUG );
use Daizu::Util qw(
like_escape
validate_uri db_datetime
db_row_exists db_select db_insert db_update transactionally
mint_guid get_subversion_properties
);
=head1 NAME
Daizu::Revision - functions for loading revisions from Subversion
=head1 DESCRIPTION
These functions are used to load metadata about revisions (and the file
path changes made in them) from the Subversion into the PostgreSQL database.
=head1 FUNCTIONS
The following functions are available for export from this module.
None of them are exported by default.
=over
=item load_revision($cms, $desired_revnum)
Load information about new revisions, up to C<$desired_revnum>.
It starts from the revision after the last one which was loaded, and
is idempotent (so if you try to load the same revision twice there will
be no change).
If C<$desired_revnum> is not defined, loads up to the most recent revision
in the repository.
This can also be called as a method on a L<Daizu> object.
=cut
sub load_revision
{
my ($cms, $desired_rev) = @_;
croak "bad revision number r$desired_rev"
if defined $desired_rev && $desired_rev < 1;
return transactionally($cms->{db}, \&_load_revision_txn,
$cms, $desired_rev);
}
sub _load_revision_txn
{
my ($cms, $desired_rev) = @_;
my $db = $cms->{db};
my $ra = $cms->{ra};
my $latest_rev = $ra->get_latest_revnum;
$desired_rev = $latest_rev
unless defined $desired_rev;
croak "can't load up to r$desired_rev, latest revision is r$latest_rev"
if $desired_rev > $latest_rev;
my $last_known_rev = db_select($db, revision => {}, 'max(revnum)');
$last_known_rev ||= 0;
assert($last_known_rev <= $latest_rev) if DEBUG;
# Return quickly if there's nothing to do.
return $last_known_rev
if $last_known_rev >= $desired_rev;
my $branches = known_branches($db);
my $trunk_id = $branches->{trunk};
for my $revnum (($last_known_rev + 1) .. $desired_rev) {
my @modified;
my (@added, @copied, @deleted);
my $date;
# Gather information about the changes in this revision.
# The database is only updated later, after the callback is finished,
# because we might need to get other information from $ra, and that's
# not allowed while get_log() is running.
$ra->get_log('/', $revnum, $revnum, 0, 1, 1, sub {
my ($paths, undef, undef, $rev_date) = @_;
$date = $rev_date;
while (my ($full_path, $changes) = each %$paths) {
my $action = $changes->action;
# Modified files don't affect identity, we just need to record
# their paths so that we can check for property changes that
# might affect the GUID URI. Only changes on trunk affect it.
if ($action eq 'M') {
next unless $full_path =~ s!^/trunk/!! && $full_path ne '';
push @modified, $full_path;
next;
}
my ($branch_id, $path) = branch_and_path($branches, $full_path);
# Ignore files which live outside branches, except when we
# see a new branch being created, in which case make a note
# of where it is in the 'branch' table. We recognize a new
# branch as a copy from an existing branch to a new location
# outside any existing branches.
if (!defined $branch_id) {
my $from = $changes->copyfrom_path;
next unless defined $from;
my $to = $full_path;
$_ =~ s!^/!! for $from, $to;
next unless exists $branches->{$from};
$path = '';
$branch_id = db_insert($db, 'branch', path => $to);
$branches->{$to} = $branch_id;
}
if ($action ne 'D') { # add or replace
my $from_full_path = $changes->copyfrom_path;
if (defined $from_full_path) {
croak "Error in revision $revnum: file $full_path copied from root directory"
if $from_full_path eq '/';
my ($from_branch_id, $from_path) =
branch_and_path($branches, $from_full_path);
croak "Error in revision $revnum: file $full_path copied from unknown branch"
unless defined $from_branch_id;
push @copied, {
full_path => $full_path,
path => $path,
branch_id => $branch_id,
from_full_path => $from_full_path,
from_path => $from_path,
from_branch_id => $from_branch_id,
from_rev => $changes->copyfrom_rev,
};
}
else {
push @added, [ $branch_id, $path, $full_path ];
}
}
if ($action ne 'A') { # delete or replace
push @deleted, [ $branch_id, $path ];
}
}
});
_add_revision($db, $revnum, $date);
_revision_guid_modifications($ra, $db, $trunk_id, $revnum, \@modified)
if @modified;
_revision_guid_path_changes($cms, $ra, $db, $trunk_id, $revnum,
\@added, \@copied, \@deleted)
if @added || @copied || @deleted;
}
return $desired_rev;
}
=item known_branches($db)
Return a reference to a hash of known branches. The keys are the paths,
and the values are the ID numbers found in the C<branch> table.
Dies if it can't find a branch with the path C<trunk>, because that indicates
a broken database.
=cut
sub known_branches
{
my ($db) = @_;
my $sth = $db->prepare('select id, path from branch');
$sth->execute;
my %branch;
while (my ($id, $path) = $sth->fetchrow_array) {
$branch{$path} = $id;
}
croak "there is no branch called 'trunk' in the database"
unless exists $branch{trunk};
return \%branch;
}
=item branch_and_path($branches, $path)
Return a list of two values, the ID number and path of the branch
which a file at C<$path> would be in, whether or not it actually exists.
The path should be relative to the root of the repository, for example
C<trunk/foo.html>. It doesn't mater whether C<$path> starts with a forward
slash.
Returns nothing if the path is not in any branch, in which case Daizu CMS
will simply ignore it.
=cut
sub branch_and_path
{
my ($branches, $path) = @_;
$path =~ s/^\///;
return if $path eq '/'; # Don't care about root directory
# Figure out which branch this path is on. Do this by checking
# ever longer prefixes of the path, since that will allow us to
# find 'trunk' very quickly.
my @path = split '/', $path;
my $branch_path = '';
my $branch_id;
while (@path) {
$branch_path .= '/' unless $branch_path eq '';
$branch_path .= shift @path;
next unless exists $branches->{$branch_path};
$branch_id = $branches->{$branch_path};
last;
}
# Ignore changes to files which aren't in a branch we know about.
return unless defined $branch_id;
# The file/directory path relative to the branch directory.
# Empty string for the top level directory.
$path = $path eq $branch_path ? ''
: substr $path, length($branch_path) + 1;
return ($branch_id, $path);
}
=item file_guid($db, $branch_id, $path, $revnum)
Returns a reference to a hash of information about the GUID for the file
in branch C<$branch_id> at C<$path> in revision C<$revnum>, or C<undef>
if there is/was no such file.
The hash will contain the following keys:
=over
=item id
GUID ID.
=item is_dir
True iff the associated file is a directory.
=item uri
The GUID URI (usually starting with C<tag:>). This will be the custom
GUID URI if overridden by a C<daizu:guid> property.
=item custom_uri
True if a C<daizu:guid> property has overridden the automatically
generated GUID URI.
=item first_revnum
The number of the first revision in which C<$path> was used for this
file in this branch.
=item last_revnum
The number of the last revision in which C<$path> was used for this
file in this branch, or C<undef> if it is still being used in the most
recently loaded revision.
=back
=cut
sub file_guid
{
my ($db, $branch_id, $path, $revnum) = @_;
return $db->selectrow_hashref(q{
select g.id, g.is_dir, g.uri, g.old_uri, g.custom_uri,
p.first_revnum, p.last_revnum
from file_guid g
inner join file_path p on g.id = p.guid_id
where p.branch_id = ?
and p.path = ?
and p.first_revnum <= ?
and (p.last_revnum is null or p.last_revnum >= ?)
}, undef, $branch_id, $path, $revnum, $revnum);
}
sub _add_revision
{
my ($db, $revnum, $date) = @_;
assert(defined $revnum) if DEBUG;
$date = db_datetime($date);
croak "revision r$revnum has no datetime stamp, or it is invalid"
unless defined $date;
db_insert($db, 'revision',
revnum => $revnum,
committed_at => $date,
);
}
sub _adjust_custom_uri
{
my ($ra, $db, $path, $revnum, $guid) = @_;
assert($path ne '') if DEBUG;
my $full_path = "trunk/$path";
my $props = get_subversion_properties($ra, $full_path, $revnum);
return unless defined $props; # not present in trunk
if (exists $props->{'daizu:guid'}) {
my $new_uri = validate_uri($props->{'daizu:guid'});
croak "error in revision $revnum: invalid URI in 'daizu:guid' property on '$full_path'"
unless defined $new_uri;
$new_uri = $new_uri->canonical;
if ($guid->{custom_uri}) {
if ($guid->{uri} ne $new_uri) {
# There was a custom URI already, but it has been changed.
db_update($db, file_guid => $guid->{id}, uri => $new_uri);
}
}
else {
# The guid property has been added, so switch from the standard
# guid to the custom one.
$db->do(q{
update file_guid
set custom_uri = true,
old_uri = uri,
uri = ?
where id = ?
}, undef, $new_uri, $guid->{id});
}
}
elsif ($guid->{custom_uri}) {
# The guid property has been removed, so switch back to the
# original standard GUID.
$db->do(q{
update file_guid
set uri = old_uri,
old_uri = null,
custom_uri = false
where id = ?
}, undef, $guid->{id});
}
}
sub _revision_guid_modifications
{
my ($ra, $db, $trunk_id, $revnum, $modified) = @_;
for my $path (@$modified) {
assert($path ne '') if DEBUG;
my $guid = file_guid($db, $trunk_id, $path, $revnum);
croak "modified file 'trunk/$path' has no GUID in revision $revnum"
unless defined $guid;
db_update($db, file_guid => $guid->{id},
last_changed_revnum => $revnum);
_adjust_custom_uri($ra, $db, $path, $revnum, $guid);
}
}
sub _revision_guid_path_changes
{
my ($cms, $ra, $db, $trunk_id, $revnum, $added, $copied, $deleted) = @_;
# Record last revnum of deleted paths.
for my $del (@$deleted) {
my ($branch_id, $path) = @$del;
if ($path eq '') {
# If the top-level directory is deleted, that means delete
# everything on the branch.
db_update($db, file_path => { branch_id => $branch_id,
last_revnum => undef },
last_revnum => $revnum - 1,
);
}
else {
db_update($db, file_path => { branch_id => $branch_id,
path => $path,
last_revnum => undef },
last_revnum => $revnum - 1,
);
# If it's a directory, mark the demise of all its children.
$db->do(q{
update file_path
set last_revnum = ?
where branch_id = ?
and path like ?
and last_revnum is null
}, undef, $revnum - 1, $branch_id, like_escape($path) . '/%');
}
}
# Process copies sorted in reverse order, so that subdirectories are
# done before their parents. That way, when I process all the paths
# within a copied directory I can skip any which have already been
# processed separately, because for example the target subdirectory was
# copied from somewhere else.
my %source_guid;
my %added_path;
for (sort { $b->{path} cmp $a->{path} } @$copied) {
# If it's not the top-level directory, process the copy.
my $is_dir = 1;
if ($_->{from_path} ne '') {
my $guid = file_guid($db, $_->{from_branch_id}, $_->{from_path},
$_->{from_rev});
croak "Error in revision $revnum: file $_->{full_path} copied from source with no GUID ($_->{from_full_path} r$_->{from_rev})"
unless defined $guid;
push @{$source_guid{$_->{branch_id}}{$guid->{id}}}, $_;
undef $added_path{$_->{path}};
$is_dir = $guid->{is_dir};
}
# If the path being copied is a directory, then also copy all of its
# children from the same source.
if ($is_dir) {
my $branch_path =
db_select($db, branch => $_->{branch_id}, 'path');
my $from_branch_path =
db_select($db, branch => $_->{from_branch_id}, 'path');
my $sth = $db->prepare(q{
select path, guid_id
from file_path
where branch_id = ?
and path like ?
and first_revnum <= ?
and (last_revnum is null or last_revnum >= ?)
});
$sth->execute($_->{from_branch_id},
($_->{from_path} eq ''
? '%'
: like_escape($_->{from_path}) . '/%'),
$_->{from_rev}, $_->{from_rev});
my $prefix_len = length $_->{from_path};
++$prefix_len if $prefix_len; # separating /
while (my ($from_path, $guid_id) = $sth->fetchrow_array) {
my $from_full_path = "$from_branch_path/$from_path";
my $child_path = substr $from_path, $prefix_len;
my $path = $_->{path} eq '' ? $child_path
: "$_->{path}/$child_path";
next if exists $added_path{$path};
my $full_path = "$branch_path/$path";
push @{$source_guid{$_->{branch_id}}{$guid_id}}, {
full_path => $full_path,
path => $path,
branch_id => $_->{branch_id},
from_full_path => $from_full_path,
from_path => $from_path,
from_branch_id => $_->{from_branch_id},
from_rev => $_->{from_rev},
};
undef $added_path{$path};
}
}
}
while (my ($branch_id, $guids) = each %source_guid) {
while (my ($guid_id, $copies) = each %$guids) {
my @copies = sort { $a->{full_path} cmp $b->{full_path} } @$copies;
my $guid_already_present = db_row_exists($db, file_path =>
guid_id => $guid_id,
branch_id => $branch_id,
last_revnum => undef,
);
# If there isn't already a live path in the target branch for this
# GUID then one of the copies with it gets to keep it.
if (!$guid_already_present) {
my $keep = shift @copies;
db_insert($db, 'file_path',
guid_id => $guid_id,
path => $keep->{path},
branch_id => $keep->{branch_id},
first_revnum => $revnum,
);
if ($keep->{branch_id} == $trunk_id) {
my $guid = file_guid($db, $trunk_id, $keep->{path},
$revnum);
_adjust_custom_uri($ra, $db, $keep->{path}, $revnum, $guid);
db_update($db, file_guid => $guid->{id},
last_changed_revnum => $revnum);
}
}
# Copies which can't keep their GUID, because it's already live
# in the target branch, get treated just like adds without history.
for (@copies) {
push @$added, [ $_->{branch_id}, $_->{path}, $_->{full_path} ];
}
}
}
for my $add (@$added) {
my ($branch_id, $path, $full_path) = @$add;
next if $path eq '';
# First mint a new GUID for it.
my $is_dir = $ra->check_path($full_path, $revnum) == $SVN::Node::dir;
my ($guid_id, $guid_uri) = mint_guid($cms, $is_dir, $path, $revnum);
_adjust_custom_uri($ra, $db, $path, $revnum, {
uri => $guid_uri,
id => $guid_id,
is_dir => $is_dir,
});
db_insert($db, file_path =>
guid_id => $guid_id,
path => $path,
branch_id => $branch_id,
first_revnum => $revnum,
);
}
}
=back
=head1 COPYRIGHT
This software is copyright 2006 Geoff Richards E<lt>geoff@laxan.comE<gt>.
For licensing information see this page:
L<http://www.daizucms.org/license/>
=cut
1;
# vi:ts=4 sw=4 expandtab