package DataStore::CAS::FS::InvalidUTF8; use strict; use warnings; use Carp; use overload '""' => \&to_string, 'cmp' => \&str_compare, '.' => \&str_concat; our $VERSION= '0.011000'; # ABSTRACT: Wrapper to represent non-utf8 data in a unicode context sub decode_utf8 { my $str= $_[-1]; !ref $str || ref($str)->isa(__PACKAGE__) or croak "Can't convert ".ref($str); return ref($str) || utf8::is_utf8($str) || utf8::decode($str)? $str : bless(\$str, __PACKAGE__); } sub is_non_unicode { 1 } sub to_string { ${$_[0]} } sub str_compare { my ($self, $other, $swap)= @_; if (ref $other eq __PACKAGE__) { $other= $$other } else { utf8::encode($other) } my $ret= $$self cmp $other; return $swap? -$ret : $ret; } sub str_concat { my ($self, $other, $swap)= @_; if (ref $other eq __PACKAGE__) { $other= $$other } else { utf8::encode($other) } return ref($self)->decode_utf8($swap? $other.$$self : $$self.$other); } sub add_json_filter { my ($self, $json)= @_; $json->filter_json_single_key_object( '*InvalidUTF8*' => \&FROM_JSON ); $json; } sub TO_JSON { my $x= ${$_[0]}; utf8::upgrade($x); return { '*InvalidUTF8*' => $x }; } sub FROM_JSON { my $x= $_[0]; utf8::downgrade($x) if utf8::is_utf8($x); return bless \$x, __PACKAGE__; } sub TO_UTF8 { ${$_[0]}; } 1; __END__ =pod =head1 NAME DataStore::CAS::FS::InvalidUTF8 - Wrapper to represent non-utf8 data in a unicode context =head1 VERSION version 0.011000 =head1 SYNOPSIS my $j= JSON->new()->convert_blessed; DataStore::CAS::FS::InvalidUTF8->add_json_filter($j); my $x= DataStore::CAS::FS::InvalidUTF8->decode_utf8("\x{FF}"); my $json= $j->encode($x); my $x2= "".$j->decode($json); is( $x, $x2 ); ok( !utf8::is_utf8($x2) ); =head1 DESCRIPTION Much like using 'i' (or j) as the square root of -1, InvalidUTF8 allows a value which should have been utf-8, but isn't, to exist alongside the others. Combining InvalidUTF8 parts to make a valid utf-8 string will automatically decode the utf-8 into the resulting unicode string. Comparing InvalidUTF8 with a regular perl string will first convert the string to a UTF-8 representation, and then do a byte-wise comparison. InvalidUTF8 can also safely pass through JSON, if the filter is added to the JSON decoder, and "allow_blessed" is set on the encoder. =head1 METHODS =head2 decode_utf8 $string_or_ref= $class->decode_utf8( $byte_str ) If the $byte_str is valid UTF-8, this method returns the decoded perl unicode string. If not, it returns the string wrapped in an instance of InvalidUTF8. =head2 is_non_unicode This method returns true, and can be used in tests like if ($_->can('is_non_unicode')) { ... } as a way of detecting InvalidUTF8 objects by API rather than class hierarchy. =head2 to_string, '""' operator Returns the original string. =head2 str_compare, 'cmp' operator Converts peer to utf-8 bytes, then compares the bytes. =head2 str_concat, '.' operator Converts the peer to utf-8 bytes, concatenates the bytes, and then re-evaluates whether the result needs to be wrapped in an instance of InvalidUTF8. =head2 add_json_filter $json_instance= $class->add_json_filter($json_instance); Applies a filter to the JSON object so that when it encounters { "*InvalidUTF8*": "$string" } it will inflate the string using the FROM_JSON method. =head2 TO_JSON Called by the JSON module when convert_blessed is enabled. Returns { "*InvalidUTF8*" => $original_str } which can be converted back to a InvalidUTF8 object during decode_json, if the filter is applied. =head2 FROM_JSON Pass this function to JSON's L with a key of C<*InvalidUTF8*> to restore the objects that were serialized. It takes care of calling L to undo the JSON module's unicode conversion. =head1 AUTHOR Michael Conrad =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2013 by Michael Conrad, and IntelliTree Solutions llc. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut