@@ -58,6 +58,11 @@ impl TranslatorBuilder {
5858 /// When disabled (the default), the translator is guaranteed to produce
5959 /// an expression that will only ever match valid UTF-8 (otherwise, the
6060 /// translator will return an error).
61+ ///
62+ /// Note that currently, even when invalid UTF-8 is banned, the translator
63+ /// will permit a negated ASCII word boundary (i.e., `(?-u:\B)`) even
64+ /// though it can actually match at invalid UTF-8 boundaries. This bug
65+ /// will be fixed on the next semver release.
6166 pub fn allow_invalid_utf8 (
6267 & mut self ,
6368 yes : bool ,
@@ -290,7 +295,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
290295 self . push ( HirFrame :: Expr ( try!( self . hir_dot ( span) ) ) ) ;
291296 }
292297 Ast :: Assertion ( ref x) => {
293- self . push ( HirFrame :: Expr ( self . hir_assertion ( x) ) ) ;
298+ self . push ( HirFrame :: Expr ( try! ( self . hir_assertion ( x) ) ) ) ;
294299 }
295300 Ast :: Class ( ast:: Class :: Perl ( ref x) ) => {
296301 if self . flags ( ) . unicode ( ) {
@@ -679,10 +684,10 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
679684 } )
680685 }
681686
682- fn hir_assertion ( & self , asst : & ast:: Assertion ) -> Hir {
687+ fn hir_assertion ( & self , asst : & ast:: Assertion ) -> Result < Hir > {
683688 let unicode = self . flags ( ) . unicode ( ) ;
684689 let multi_line = self . flags ( ) . multi_line ( ) ;
685- match asst. kind {
690+ Ok ( match asst. kind {
686691 ast:: AssertionKind :: StartLine => {
687692 Hir :: anchor ( if multi_line {
688693 hir:: Anchor :: StartLine
@@ -714,10 +719,20 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
714719 Hir :: word_boundary ( if unicode {
715720 hir:: WordBoundary :: UnicodeNegate
716721 } else {
722+ // It is possible for negated ASCII word boundaries to
723+ // match at invalid UTF-8 boundaries, even when searching
724+ // valid UTF-8.
725+ //
726+ // TODO(ag): Enable this error when regex goes to 1.0.
727+ // Otherwise, it is too steep of a breaking change.
728+ // if !self.trans().allow_invalid_utf8 {
729+ // return Err(self.error(
730+ // asst.span, ErrorKind::InvalidUtf8));
731+ // }
717732 hir:: WordBoundary :: AsciiNegate
718733 } )
719734 }
720- }
735+ } )
721736 }
722737
723738 fn hir_group ( & self , group : & ast:: Group , expr : Hir ) -> Hir {
@@ -1490,7 +1505,15 @@ mod tests {
14901505 assert_eq ! ( t( r"\b" ) , hir_word( hir:: WordBoundary :: Unicode ) ) ;
14911506 assert_eq ! ( t( r"\B" ) , hir_word( hir:: WordBoundary :: UnicodeNegate ) ) ;
14921507 assert_eq ! ( t( r"(?-u)\b" ) , hir_word( hir:: WordBoundary :: Ascii ) ) ;
1493- assert_eq ! ( t( r"(?-u)\B" ) , hir_word( hir:: WordBoundary :: AsciiNegate ) ) ;
1508+ assert_eq ! (
1509+ t_bytes( r"(?-u)\B" ) ,
1510+ hir_word( hir:: WordBoundary :: AsciiNegate ) ) ;
1511+
1512+ // TODO(ag): Enable this tests when regex goes to 1.0.
1513+ // assert_eq!(t_err(r"(?-u)\B"), TestError {
1514+ // kind: hir::ErrorKind::InvalidUtf8,
1515+ // span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
1516+ // });
14941517 }
14951518
14961519 #[ test]
@@ -2355,13 +2378,13 @@ mod tests {
23552378 assert ! ( t_bytes( r"[^a][^a]" ) . is_always_utf8( ) ) ;
23562379 assert ! ( t_bytes( r"\b" ) . is_always_utf8( ) ) ;
23572380 assert ! ( t_bytes( r"\B" ) . is_always_utf8( ) ) ;
2381+ assert ! ( t_bytes( r"(?-u)\b" ) . is_always_utf8( ) ) ;
23582382
23592383 // Negative examples.
23602384 assert ! ( !t_bytes( r"(?-u)\xFF" ) . is_always_utf8( ) ) ;
23612385 assert ! ( !t_bytes( r"(?-u)\xFF\xFF" ) . is_always_utf8( ) ) ;
23622386 assert ! ( !t_bytes( r"(?-u)[^a]" ) . is_always_utf8( ) ) ;
23632387 assert ! ( !t_bytes( r"(?-u)[^a][^a]" ) . is_always_utf8( ) ) ;
2364- assert ! ( !t_bytes( r"(?-u)\b" ) . is_always_utf8( ) ) ;
23652388 assert ! ( !t_bytes( r"(?-u)\B" ) . is_always_utf8( ) ) ;
23662389 }
23672390
@@ -2490,7 +2513,7 @@ mod tests {
24902513 assert ! ( t( r"\A" ) . is_match_empty( ) ) ;
24912514 assert ! ( t( r"\z" ) . is_match_empty( ) ) ;
24922515 assert ! ( t( r"\B" ) . is_match_empty( ) ) ;
2493- assert ! ( t ( r"(?-u)\B" ) . is_match_empty( ) ) ;
2516+ assert ! ( t_bytes ( r"(?-u)\B" ) . is_match_empty( ) ) ;
24942517
24952518 // Negative examples.
24962519 assert ! ( !t( r"a+" ) . is_match_empty( ) ) ;
0 commit comments