kdesrc-build/modules/ksb/KDEXMLReader.pm

package ksb::KDEXMLReader;

# Class: KDEXMLReader
#
# kde_projects.xml module-handling code.
# The core of this was graciously contributed by Allen Winter, and then
# touched-up and kdesrc-build'ed by myself -mpyne.
# (By late 2015 this is mostly mpyne's fault -mpyne).

use strict;
use warnings;
use 5.014;

our $VERSION = '0.20';

use XML::Parser;

# Method: new
#
# Constructs a new KDEXMLReader. This doesn't contradict any part of the class
# documentation which claims this class is a singleton however. This should be
# called as a method (e.g. KDEXMLReader->new(...)).
#
# Parameters:
#  $inputHandle - Ref to filehandle to read from. Must implement _readline_ and
#  _eof_.
#  $desiredProtocol - Normally 'git', but other protocols like 'http' can also
#   be preferred (e.g. for proxy compliance).
sub new
{
    my $class = shift;
    my $inputHandle = shift;
    my $desiredProtocol = shift;

    my $self = {
        # Maps short names to repo info blocks
        repositories => { },
    };

    $self = bless ($self, $class);
    $self->_readProjectData($inputHandle, $desiredProtocol);

    return $self;
}

# XML tags which group repositories.
my %xmlGroupingIds = (
    component => 1,
    module    => 1,
    project   => 1,
);

# The 'main' method for this class. Reads in *all* KDE projects and notes
# their details for later queries.
# Be careful, can throw exceptions.
sub _readProjectData
{
    my ($self, $inputHandle, $desiredProtocol) = @_;

    $desiredProtocol //= '';
    my $auxRef = {
        # Used to assign full names to modules.
        nameStackRef    => [],
        # >0 if we are actually in a repo element.
        inRepo          => 0,
        # ref to hash table entry for current XML element.
        curRepository   => undef,
        desiredProtocol => $desiredProtocol,
        repositoryRef   => $self->{repositories},
    };

    my $parser = XML::Parser->new(
        Handlers =>
            {
                Start => sub { _xmlTagStart($auxRef, @_); },
                End   => sub { _xmlTagEnd  ($auxRef, @_); },
                Char  => sub { _xmlCharData($auxRef, @_); },
            },
    );

    # Will die if the XML is not well-formed.
    $parser->parse($inputHandle);
}

# Note on $proj: A /-separated path is fine, in which case we look
# for the right-most part of the full path which matches all of searchProject.
# e.g. kde/kdebase/kde-runtime would be matched by a proj of either
# "kdebase/kde-runtime" or simply "kde-runtime".
sub getModulesForProject
{
    my ($self, $proj) = @_;

    my $repositoryRef = $self->{repositories};
    my @results;
    my $findResults = sub {
        push @results, (
            grep {
                _projectPathMatchesWildcardSearch(
                    $repositoryRef->{$_}->{'fullName'}, $proj)
            } (keys %{$repositoryRef}));
    };

    # Wildcard matches happen as specified if asked for.
    # Non-wildcard matches have an implicit "$proj/*" search as well for
    # compatibility with previous use-modules
    # Project specifiers ending in .git are forced to be non-wildcarded.
    if ($proj !~ /\*/ && $proj !~ /\.git$/) {
        # We have to do a search to account for over-specified module names
        # like phonon/phonon
        $findResults->();

        # Now setup for a wildcard search to find things like kde/kdelibs/baloo
        # if just 'kdelibs' is asked for.
        $proj .= '/*';
    }

    $proj =~ s/\.git$//;

    # If still no wildcard and no '/' then we can use direct lookup by module
    # name.
    if ($proj !~ /\*/ && $proj !~ /\// && exists $repositoryRef->{$proj}) {
        push @results, $proj;
    }
    else {
        $findResults->();
    }

    return @{$repositoryRef}{@results};
}

sub _xmlTagStart
{
    my ($aux, $expat, $element, %attrs) = @_;

    my $nameStackRef = $aux->{nameStackRef};
    if (exists $xmlGroupingIds{$element}) {
        push @{$nameStackRef}, $attrs{'identifier'};
    }

    my $curRepository = $aux->{curRepository};
    my $inRepo = $aux->{inRepo};

    # This code used to check for direct descendants and filter them out.
    # Now there are better ways (kde-build-metadata/build-script-ignore and
    # the user can customize using ignore-modules), and this filter made it
    # more difficult to handle kde/kdelibs{,/nepomuk-{core,widgets}}, so leave
    # it out for now. See also bug 321667.
    if ($element eq 'repo')
    {
        # This flag is cleared by the <repo>-end handler, so this *should* be
        # logically impossible.
        die "We are already tracking a repository" if $inRepo > 0;

        $aux->{inRepo} = 1;
        my $name = ${$nameStackRef}[-1];
        $curRepository = {
            'fullName' => join('/', @{$nameStackRef}),
            'repo' => '',
            'name' => $name,
            'active' => 'false',
            'tarball' => '',
            'branch'   => '',
            'branches' => [ ],
            'branchtype' => '', # Either branch:stable or branch:trunk
        }; # Repo/Active/tarball to be added by char handler.

        $aux->{repositoryRef}->{$name} = $curRepository;
        $aux->{curRepository} = $curRepository;
    }

    # Currently we only pull data while under a <repo> tag, so bail early if
    # we're not doing this to simplify later logic.
    return unless $inRepo;

    my $desiredProtocol = $aux->{desiredProtocol};

    # Character data is integrated by the char handler. To avoid having it
    # dump all willy-nilly into our dict, we leave a flag for what the
    # resultant key should be.
    if ($element eq 'active') {
        $curRepository->{'needs'} = 'active';

        # Unset our default value since one is present in the XML
        $curRepository->{'active'} = '';
    }
    # For git repos we want to retain the repository data and any snapshot
    # tarballs available.
    elsif ($element eq 'url') {
        $curRepository->{'needs'} =
            #                     this proto       | needs this attr set
            $attrs{'protocol'} eq $desiredProtocol ? 'repo'    :
            $attrs{'protocol'} eq 'tarball'        ? 'tarball' : undef;
    }
    # i18n data gives us the defined stable and trunk branches.
    elsif ($element eq 'branch') {
        $curRepository->{'needs'} = 'branch';

        my $branchType = $attrs{'i18n'} // '';
        $curRepository->{'branchtype'} = "branch:$branchType" if $branchType;
    }
}

sub _xmlTagEnd
{
    my ($aux, $expat, $element) = @_;

    my $nameStackRef = $aux->{nameStackRef};
    if (exists $xmlGroupingIds{$element}) {
        pop @{$nameStackRef};
    }

    my $inRepo = $aux->{inRepo};
    my $curRepository = $aux->{curRepository};

    # If gathering data for char handler, stop now.
    if ($inRepo && defined $curRepository->{'needs'}) {
        delete $curRepository->{'needs'};
    }

    # Save all branches encountered, mark which ones are 'stable' and 'trunk'
    # for i18n purposes, as this keys into use-stable-kde handling.
    if ($element eq 'branch') {
        my $branch = $curRepository->{'branch'};
        push @{$curRepository->{'branches'}}, $branch;
        $curRepository->{'branch'} = '';

        my $branchType = $curRepository->{'branchtype'};
        $curRepository->{$branchType} = $branch if $branchType;
        $curRepository->{'branchtype'} = '';
    }

    if ($element eq 'repo' && $inRepo) {
        $aux->{inRepo} = 0;
        $aux->{curRepository} = undef;
    }
}

sub _xmlCharData
{
    my ($aux, $expat, $utf8Data) = @_;
    my $curRepository = $aux->{curRepository};

    # The XML::Parser manpage makes it clear that the char handler can be
    # called consecutive times with data for the same tag, so we use the
    # append operator and then clear our flag in _xmlTagEnd.
    if ($curRepository && defined $curRepository->{'needs'}) {
        $curRepository->{$curRepository->{'needs'}} .= $utf8Data;
    }
}

# Utility subroutine, returns true if the given kde-project full path (e.g.
# kde/kdelibs/nepomuk-core) matches the given search item.
#
# The search item itself is based on path-components. Each path component in
# the search item must be present in the equivalent path component in the
# module's project path for a match. A '*' in a path component position for the
# search item matches any project path component.
#
# Finally, the search is pinned to search for a common suffix. E.g. a search
# item of 'kdelibs' would match a project path of 'kde/kdelibs' but not
# 'kde/kdelibs/nepomuk-core'. However 'kdelibs/*' would match
# 'kde/kdelibs/nepomuk-core'.
#
# First parameter is the full project path from the kde-projects database.
# Second parameter is the search item.
# Returns true if they match, false otherwise.
sub _projectPathMatchesWildcardSearch
{
    my ($projectPath, $searchItem) = @_;

    my @searchParts = split(m{/}, $searchItem);
    my @nameStack   = split(m{/}, $projectPath);

    if (scalar @nameStack >= scalar @searchParts) {
        my $sizeDifference = scalar @nameStack - scalar @searchParts;

        # We might have to loop if we somehow find the wrong start point for our search.
        # E.g. looking for a/b/* against a/a/b/c, we'd need to start with the second a.
        my $i = 0;
        while ($i <= $sizeDifference) {
            # Find our common prefix, then ensure the remainder matches item-for-item.
            for (; $i <= $sizeDifference; $i++) {
                last if $nameStack[$i] eq $searchParts[0];
            }

            return if $i > $sizeDifference; # Not enough room to find it now

            # At this point we have synched up nameStack to searchParts, ensure they
            # match item-for-item.
            my $found = 1;
            for (my $j = 0; $found && ($j < @searchParts); $j++) {
                return 1   if $searchParts[$j] eq '*'; # This always works
                $found = 0 if $searchParts[$j] ne $nameStack[$i + $j];
            }

            return 1 if $found; # We matched every item to the substring we found.
            $i++; # Try again
        }
    }

    return;
}

1;