package DataStore::CAS::FS::DirCodec::Unix; use 5.008; use strict; use warnings; use Try::Tiny; use Carp; use JSON 2.53 (); use Scalar::Util 'looks_like_number'; require DataStore::CAS::FS::Dir; require DataStore::CAS::FS::DirEnt; require DataStore::CAS::FS::InvalidUTF8; *decode_utf8= *DataStore::CAS::FS::InvalidUTF8::decode_utf8; use parent 'DataStore::CAS::FS::DirCodec'; our $VERSION= '0.011000'; __PACKAGE__->register_format(unix => __PACKAGE__); # ABSTRACT: Efficiently encode only the attributes of a UNIX stat() our $_json_coder; sub _build_json_coder { DataStore::CAS::FS::InvalidUTF8->add_json_filter( JSON->new->utf8->canonical->convert_blessed, 1 ); } our %_TypeToCode= ( file => ord('f'), dir => ord('d'), symlink => ord('l'), chardev => ord('c'), blockdev => ord('b'), pipe => ord('p'), socket => ord('s'), whiteout => ord('w'), ); our %_CodeToType= map { $_TypeToCode{$_} => $_ } keys %_TypeToCode; our @_FieldOrder= qw( type name ref size modify_ts unix_uid unix_gid unix_mode metadata_ts access_ts unix_nlink unix_dev unix_inode unix_blocksize unix_blockcount ); sub encode { my ($class, $entry_list, $metadata)= @_; $metadata= defined($metadata)? { %$metadata } : {}; defined $metadata->{_} and croak '$metadata{_} is reserved for the directory encoder'; my (%umap, %gmap); my @entries= map { my $e= ref $_ eq 'HASH'? $_ : $_->as_hash; defined $e->{type} or croak "'type' attribute is required"; my $code= $_TypeToCode{$e->{type}} or croak "Unknown directory entry type: ".$e->{type}; my $name= $e->{name}; defined $name or croak "'name' attribute is required"; _make_utf8($name) or croak "'name' must be a unicode scalar or an InvalidUTF8 instance"; my $ref= $e->{ref}; $ref= '' unless defined $ref; _make_utf8($ref) or croak "'ref' must be a unicode scalar or an InvalidUTF8 instance"; $umap{$e->{unix_uid}}= $e->{unix_user} if defined $e->{unix_uid} && defined $e->{unix_user}; $gmap{$e->{unix_gid}}= $e->{unix_group} if defined $e->{unix_gid} && defined $e->{unix_group}; my $int_attr_str= join(":", map { !defined $_? '' : looks_like_number($_)? $_ : croak "Invalid unix attribute number: $_" } @{$e}{@_FieldOrder[3..$#_FieldOrder]} ); # As an optimization, all undef trailing fields can be chopped off. $int_attr_str =~ s/:+$//; croak "'name' too long: '$name'" if length($name) > 255; croak "'ref' too long: '$ref'" if length($ref) > 255; croak "Unix fields too long: '$int_attr_str'" if length($int_attr_str) > 255; pack('CCCC', length($name), length($ref), length($int_attr_str), $code).$name."\0".$ref."\0".$int_attr_str; } @$entry_list; # Save the mapping of UID to User and GID to Group $metadata->{_}{umap}= \%umap; $metadata->{_}{gmap}= \%gmap; my $meta_json= ($_json_coder ||= _build_json_coder())->encode($metadata); my $ret= "CAS_Dir 04 unix\n" .pack('N', length($meta_json)).$meta_json .join('', sort { substr($a,4) cmp substr($b,4) } @entries); croak "Accidental unicode concatenation" if utf8::is_utf8($ret); $ret; } # Convert string in-place to utf-8 bytes, or return false. # A less speed-obfuscated version might read: # my $str= shift; # if (ref $str) { # return 0 unless ref($str)->can('TO_UTF8'); # $_[0]= $str->TO_UTF8; # return 1; # } elsif (utf8::is_utf8($str)) { # utf8::encode($_[0]); # return 1; # } else { # return !($_[0] =~ /[\x7F-\xFF]/); # } sub _make_utf8 { ref $_[0]? (ref($_[0])->can('TO_UTF8') && (($_[0]= $_[0]->TO_UTF8) || 1)) : &utf8::is_utf8 && (&utf8::encode || 1) || !($_[0] =~ /[\x80-\xFF]/); } sub decode { my ($class, $params)= @_; $params->{format}= $class->_read_format($params) unless defined $params->{format}; my $handle= $params->{handle}; if (!$handle) { if (defined $params->{data}) { open($handle, '<', \$params->{data}) or croak "can't open handle to scalar"; } else { $handle= $params->{file}->open; } } my $header_len= $class->_calc_header_length($params->{format}); seek($handle, $header_len, 0) or croak "seek: $!"; my (@entries, $buf, $pos); # first, pull out the metadata, which includes the UID map and GID map. $class->_readall($handle, $buf, 4); my ($dirmeta_len)= unpack('N', $buf); $class->_readall($handle, my $json, $dirmeta_len); my $meta= ($_json_coder ||= _build_json_coder())->decode($json); # Quick sanity checks ref $meta->{_}{umap} and ref $meta->{_}{gmap} or croak "Incorrect directory metadata"; my $dirmeta= delete $meta->{_}; while (!eof $handle) { $class->_readall($handle, $buf, 4); my ($name_len, $ref_len, $meta_len, $code)= unpack('CCCC', $buf); $class->_readall($handle, $buf, $name_len+$ref_len+$meta_len+2); my @fields= ( $dirmeta, $code, DataStore::CAS::FS::InvalidUTF8->decode_utf8(substr($buf, 0, $name_len)), $ref_len? DataStore::CAS::FS::InvalidUTF8->decode_utf8(substr($buf, $name_len+1, $ref_len)) : undef, map { length($_)? $_ : undef } split(":", substr($buf, $name_len+$ref_len+2, $meta_len)), ); push @entries, bless(\@fields, __PACKAGE__.'::Entry'); } close $handle; return DataStore::CAS::FS::Dir->new( file => $params->{file}, format => $params->{format}, metadata => $meta, entries => \@entries, ); } package DataStore::CAS::FS::DirCodec::Unix::Entry; use strict; use warnings; use parent 'DataStore::CAS::FS::DirEnt'; sub _dirmeta { $_[0][0] } sub type { $_CodeToType{$_[0][1]} } sub name { $_[0][2] } sub ref { $_[0][3] } sub size { $_[0][4] } sub modify_ts { $_[0][5] } sub unix_uid { $_[0][6] } sub unix_gid { $_[0][7] } sub unix_mode { $_[0][8] } sub metadata_ts { $_[0][9] } sub access_ts { $_[0][10] } sub unix_nlink { $_[0][11] } sub unix_dev { $_[0][12] } sub unix_inode { $_[0][13] } sub unix_blocksize { $_[0][14] } sub unix_blockcount { $_[0][15] } *unix_mtime= *modify_ts; *unix_atime= *access_ts; *unix_ctime= *metadata_ts; sub unix_user { my $self= shift; $self->_dirmeta->{umap}{ $self->unix_uid } } sub unix_group { my $self= shift; $self->_dirmeta->{gmap}{ $self->unix_gid } } sub as_hash { my $self= shift; return { type => $self->type, map { $_FieldOrder[$_-1] => $self->[$_] } grep { defined $self->[$_] } 2 .. $#$self }; } 1; __END__ =pod =head1 NAME DataStore::CAS::FS::DirCodec::Unix - Efficiently encode only the attributes of a UNIX stat() =head1 VERSION version 0.011000 =head1 DESCRIPTION This directory encoder/decoder encodes only the fields of a L corresponding to a unix stat_t structure. (or more precisely, the fields perl returns from the stat function) Any other fields in the L are ignored. It does this much more efficiently than would be done in L, but still uses text, to avoid complications of endian-ness and word size. (and because 32-bit perl can't numerically process 64-bit integers) The encoding is further optimized by ordering the fields by likelyhood of being used, and truncating records at the last used field. It also imposes some restrictions: 'name' and 'ref' must each be less than 256 bytes when encoded as UTF-8. There is also a limitation on the unix stat values, but they will all fit even with max-length 64 bit integers, so this shouldn't ever be a problem. =head1 METHODS =head2 encode $serialized= $class->encode( \@entries, \%metadata ) See Lencode|DataStore::CAS::FS::DirCodec/encode> for details. =head2 decode my $dir= $class->decode( \%params ) See Lload|DataStore::CAS::FS::DirCodec/load> for details on C<%params>. =head1 AUTHOR Michael Conrad =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2013 by Michael Conrad, and IntelliTree Solutions llc. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut