package Chemistry::File::SMARTS; $VERSION = "0.22"; # $Id: SMARTS.pm,v 1.8 2005/05/16 22:20:24 itubert Exp $ use 5.006; use strict; use warnings; use Chemistry::Pattern; use base "Chemistry::File"; use Carp; #use Data::Dumper; use List::Util 'sum', 'first'; =head1 NAME Chemistry::File::SMARTS - SMARTS chemical substructure pattern linear notation parser =head1 SYNOPSYS #!/usr/bin/perl use Chemistry::File::SMARTS; # this string matches an oxygen next to an atom with three # neighbors, one of which is a hydrogen, and a positive charge my $smarts = 'O[D3H+]'; # parse a SMARTS string and compile it into a # Chemistry::Pattern object my $patt = Chemistry::Pattern->parse("$smarts", format => 'smarts'); # find matches of the pattern in a Chemistry::Mol object $mol my $mol = Chemistry::Mol->read("myfile.mol"); while ($patt->match($mol)) { print "pattern matches atoms: ", $patt->atom_map, "\n" } # NOTE: if the SMARTS pattern relies on aromaticity or ring # properties, you have to make sure that the target # molecule is "aromatized" first: my $smarts = 'c:a'; my $patt = Chemistry::Pattern->parse("$smarts", format => 'smarts'); use Chemistry::Ring 'aromatize_mol'; aromatize_mol($mol); # <--- AROMATIZE!!! while ($patt->match($mol)) { print "pattern matches atoms: ", $patt->atom_map, "\n" } # Note that "atom mapping numbers" end up as $atom->name my $patt = Chemistry::Pattern->parse("[C:7][C:8]", format => 'smarts'); print $patt->atoms(1)->name; # prints 7 =head1 DESCRIPTION This module parse a SMARTS (SMiles ARbitrary Target Specification) string, generating a L object. It is a file I/O driver for the PerlMol toolkit; it's not called directly but by means of the Chemistry::Pattern->parse class method. For a detailed description of the SMARTS language, see L. Note that this module doesn't implement the full language, as detailed under CAVEATS. This module is part of the PerlMol project, L. =cut # Initialization Chemistry::Mol->register_format(smarts => __PACKAGE__); our $DEBUG = 0; # Chemistry::File interface sub parse_string { my ($self, $s, %opts) = @_; my $patt = parse_smarts($s, \%opts); $patt->name($s); $patt; } ############## SMARTS PARSER ############## sub parse_smarts { my ($s, $opts) = @_; my %digits; my $toks = tokenize($s); my $tok = shift(@$toks); print "tok: $tok\n" if $DEBUG; my $mol_class = $opts->{mol_class} || "Chemistry::Pattern"; my $patt = $mol_class->new(); $patt->options($opts->{pattern_options}||{}); my @atom_stack; my $current_atom = parse_atom($patt, $tok, $toks); while (defined ($tok = shift @$toks)) { print "tok: $tok\n" if $DEBUG; if ($tok eq '(') { push @atom_stack, $current_atom; } elsif ($tok eq ')') { $current_atom = pop @atom_stack; } else { # bond, atom my $next_tok = shift @$toks; if ($next_tok =~ /^\d+$/) { # digit if ($digits{$next_tok}) { # close ring parse_bond($patt, $tok, $current_atom, $digits{$next_tok}); $digits{$next_tok} = undef; } else { # open ring $digits{$next_tok} = $current_atom; } } else { my $next_atom = parse_atom($patt, $next_tok, $toks); parse_bond($patt, $tok, $current_atom, $next_atom); $current_atom = $next_atom; } } } $patt; } sub parse_atom { my ($patt, $s, $toks) = @_; my $n_rec = 0; my $name; if ($s =~ s/:(\d+)$//) { $name = $1; } my $expr = join " and ", map { join " || ", map { join ' && ', map { parse_atomic_primitive($_, \$n_rec); } split '&', $_; } split ',', $_; } split ';', $s; my @recs; for (1 .. $n_rec) { my $rec_smarts = shift @$toks; my $rec = Chemistry::Pattern->parse($rec_smarts, pattern_options => {overlap=>0, permute=>0}, format => 'smarts', ); push @recs, $rec; } print "atom expr: $expr\n" if $DEBUG; my $sub = eval <atom_class->new( test_sub => $sub, name => $name, ); $patt->add_atom($atom); $atom; } # missing primitives: @, @@ sub parse_atomic_primitive { my ($s, $n_rec) = @_; local $_ = $s; my @terms; no warnings 'uninitialized'; s/^(!?)H// && # Hydrogen atom push @terms, "$1(\$atom->symbol eq 'H')"; s/(!?) (Zr|Zn|Yb|Y|Xe|W|V|U|Tm|Tl|Ti|Th| Te|Tc|Tb|Ta|Sr|Sn|Sm|Si|Sg|Se|Sc|Sb|S|Ru|Rn|Rh|Rf|Re|Rb|Ra| Pu|Pt|Pr|Po|Pm|Pd|Pb|Pa|P|Os|O|Np|No|Ni|Ne|Nd|Nb|Na|N|Mt|Mt| Mo|Mn|Mg|Md|Lu|Lr|Li|La|Kr|K|Ir|In|I|Hs|Hs|Ho|Hg|Hf|He|Ge| Gd|Ga|Fr|Fm|Fe|F|Eu|Es|Er|Dy|Ds|Db|Cu|Cs|Cr|Co|Cm|Cl|Cf|Ce| Cd|Ca|C|Br|Bk|Bi|Bh|Be|Ba|B|Au|At|As|Ar|Am|Al|Ag|Ac) //x # Order is reverse alphabetical to ensure longest match && push @terms, "$1(\$atom->symbol eq '$2' && ! \$atom->aromatic)"; s/(!?)\*// && # wildcard push @terms, "${1}1"; s/(!?)D(\d?)// && # explicit connections (shouldn't count h) push @terms, "$1(\$atom->bonds == " . (length $2 ? $2 : 1) . ')'; s/(!?)a// && # aromatic push @terms, "$1\$atom->aromatic"; s/(!?)A// && # aliphatic push @terms, "$1(!\$atom->aromatic)"; s/(!?)X(\d?)// && # total connections (should add implicit H) push @terms, "$1(\$atom->bonds + \$atom->hydrogens == " . (length $2 ? $2 : 1) . ')'; s/(!?)v(\d?)// && # valence push @terms, "$1(\$atom->valence == " . (length $2 ? $2 : 1) . ')'; s/(!?)H(\d?)// && # total H-count push @terms, "$1(sum(map {\$_->symbol eq 'H'} \$atom->neighbors) + \$atom->hydrogens == " . (length $2 ? $2 : 1) . ')'; s/(!?)h(\d?)// && # implicit H-count push @terms, "$1(\$atom->hydrogens == " . (length $2 ? $2 : 1) . ')'; s/(!?)R(\d?)// && # number of rings push @terms, "$1(\@{\$atom->attr('ring/rings')||[]} " . (length $2 ? "== $2" : "") . ')'; s/(!?)r(\d?)// && # ring size push @terms, "$1(first { " . (length $2 ? "\$_->atoms == $2" : "1") ." } \@{\$atom->attr('ring/rings')||[]} )"; s/(!?)#(\d+)// && # atomic number push @terms, "$1(\$atom->Z == $2)"; s/(!?)([+-]\d+)// && # numerical charge push @terms, "$1(\$atom->formal_charge == $2)"; s/(!?)(\++)// && # positive charge push @terms, "$1(\$atom->formal_charge == " . length($2) . ')'; s/(!?)(-+)// && # negative charge push @terms, "$1(\$atom->formal_charge == -" . length($2) . ')'; s/(!?)(\d+)// && # mass push @terms, "$1(\$atom->mass == $2)"; s/(!?)([cnosp])// && # aromatic symbol push @terms, "$1(\$atom->symbol eq '@{[uc $2]}' && \$atom->aromatic)"; #s/(!?)([A-Z][a-z]?)// && # aliphatic symbol #push @terms, "$1(\$atom->symbol eq '$2' && ! \$atom->aromatic)"; while (s/(!?)\$//) { # recursive SMARTS push @terms, qq{$1(\$recs[$$n_rec]->match(\$atom->parent,atom=>\$atom))}; $$n_rec++; } join ' && ', @terms; } sub parse_bond { my ($patt, $s, @atoms) = @_; return if $s eq '.'; # the disconnected non-bond my $expr; if ($s) { $expr = join " and ", map { join " || ", map { join ' && ', map { parse_bond_primitive($_); } split '&', $_; } split ',', $_; } split ';', $s; } else { $expr = '($bond->order == 1 || $bond->aromatic)'; } print "bond expr: $expr\n" if $DEBUG; my $sub = eval <bond_class->new(test_sub => $sub, atoms => \@atoms); $patt->add_bond($bond); $bond; } sub parse_bond_primitive { local ($_) = @_; my @terms; s/(!?)~// && # wildcard push @terms, "${1}1"; s/(!?)-// && # single push @terms, "$1(\$bond->order == 1 && !\$bond->aromatic)"; s/(!?)=// && # double push @terms, "$1(\$bond->order == 2)"; s/(!?)#// && # triple push @terms, "$1(\$bond->order == 3)"; s/(!?):// && # triple push @terms, "$1\$bond->aromatic"; s/(!?)\@// && # ring bond push @terms, "$1(\@{\$bond->attr('ring/rings')||[]} )"; join ' && ', @terms; } my %ORGANIC_ELEMS = ( Br => 1, Cl => 1, B => 1, C => 1, N => 1, O => 1, P => 1, S => 1, F => 1, I => 1, s => 1, p => 1, o => 1, n => 1, c => 1, b => 1, ); sub tokenize { my ($s) = @_; my $state = 3; my $paren_depth = 0; my $atom; my $digit; my $rec_smart; my @rec_smarts; my $bond = ''; my $symbol; my @toks; # 0 expects atom or branch # after atom: atom or bond or branch # after bond: atom or branch or /,;/ # token types: atom, bond, (, ) my @chars = split '', $s; my $char; while (defined ($char = shift @chars)) { print "char: $char\n" if $DEBUG; if ($state == 0) { # expect atom or branch (not used!) push(@toks, $char), $state = 2, next if ($char =~ /[()]/); $state = 3, redo; } elsif ($state == 1) { # in complex atom if ($char eq ']') { push @toks, $atom, @rec_smarts; @rec_smarts = (); $state = 4, next; } $atom .= $char; $state = 5 if ($char eq '$'); # recursive smarts next; } elsif ($state == 2) { # expect bond if ($char =~ /[-=:~\\\/#@?,;&.!]/) { $bond .= $char; next; } else { push @toks, $bond; $bond = ''; $state = 3, redo; } } elsif ($state == 3) { # expect atom $state = 1, $atom = '', next if $char eq '['; if ($char eq '%') { $state = 7, next; } $symbol = $char; push(@toks, $symbol), last unless @chars; $char = shift @chars; if ($ORGANIC_ELEMS{$symbol.$char}) { push @toks, $symbol.$char; $state = 4, next; } else { push @toks, $symbol; $state = 4, redo; } } elsif ($state == 4) { # expect atom or bond or branch push(@toks, $char), next if ($char =~ /[()]/); # branch $state = 2, redo; # bond } elsif ($state == 5) { # expect left paren croak "expected (" unless $char eq '('; $rec_smart = ''; $paren_depth++, $state = 6, next; } elsif ($state == 6) { # within recursive smarts #croak "Recursive SMARTS not implemented yet\n"; $paren_depth++ if $char eq '('; $paren_depth-- if $char eq ')'; unless ($paren_depth) { push @rec_smarts, $rec_smart; $state = 1, next; } $rec_smart .= $char; } elsif ($state == 7) { # double digit $digit = $char . (shift @chars || die "expected second digit"); push @toks, $digit; $state = 2; } else { die "shouldn't be here"; } } #print Dumper \@toks if $DEBUG; \@toks; } 1; =head1 CAVEATS The following features are not implemented yet: =over =item chirality: @, @@ =item component-level gruouping That is, the difference between these three cases: (SMARTS) (SMARTS).(SMARTS) (SMARTS).SMARTS =back The so-called parser is very lenient, so if you give it something that's not quite reasonable it will ignore it or interpret it in a strange way without warning. As shown in the synopsis, you have to make sure that the molecule is "aromatized" if you want to apply to it a pattern that relies on aromaticity or ring properties. =head1 VERSION 0.22 =head1 SEE ALSO L, L, L, L. For more information about SMARTS, see the SMARTS Theory Manual at L =head1 AUTHOR Ivan Tubert-Brohman Eitub@cpan.orgE =head1 COPYRIGHT Copyright (c) 2005 Ivan Tubert-Brohman. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut