From c910e1ff1538dea15165bc853cf45427e21365f1 Mon Sep 17 00:00:00 2001
From: Duan Yao <duanyao@ustc.edu>
Date: Sat, 15 Nov 2014 01:15:44 +0800
Subject: [PATCH] Improve --space-as-offset: determine spaces by unicode

---
 src/HTMLRenderer/text.cc | 48 ++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/HTMLRenderer/text.cc b/src/HTMLRenderer/text.cc
index 5d5ecd92b..63ec94b8a 100644
--- a/src/HTMLRenderer/text.cc
+++ b/src/HTMLRenderer/text.cc
@@ -85,43 +85,43 @@ void HTMLRenderer::drawString(GfxState * state, GooString * s)
         tracer.draw_char(state, dx, dy, ax, ay);
 
         bool is_space = false;
-        if (n == 1 && *p == ' ') 
+        if (n == 1 && *p == ' ')
         {
             /*
-             * This is by standard
-             * however some PDF will use ' ' as a normal encoding slot
-             * such that it will be mapped to other unicodes
-             * In that case, when space_as_offset is on, we will simply ignore that character...
-             *
-             * Checking mapped unicode may or may not work
-             * There are always ugly PDF files with no useful info at all.
+             * is_space indicates an ASCII SPACE character code before decoding.
+             * This follows the PDF standard: only such characters are affected by "word space".
+             * However, some PDFs use ' ' as a normal encoding slot,
+             * so that it is mapped to some other Unicode character.
              */
             is_space = true;
         }
-        
-        if(is_space && (param.space_as_offset))
+
+        if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
         {
-            html_text_page.get_cur_line()->append_padding_char();
-            // ignore horiz_scaling, as it has been merged into CTM
-            html_text_page.get_cur_line()->append_offset((ax * cur_font_size + cur_letter_space + cur_word_space) * draw_text_scale);
+            html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx);
         }
         else
         {
-            if((param.decompose_ligature) && (uLen > 1) && all_of(u, u+uLen, isLegalUnicode))
+            Unicode uu;
+            if(cur_text_state.font_info->use_tounicode)
             {
-                html_text_page.get_cur_line()->append_unicodes(u, uLen, ddx);
+                uu = check_unicode(u, uLen, code, font);
+            }
+            else
+            {
+                uu = unicode_from_font(code, font);
+            }
+            // The Unicode mapping may be wrong in some PDFs, so a character with uu == ' ' may actually be visible.
+            // In that case, when space_as_offset is on, that character is simply dropped and replaced by an offset.
+            if((param.space_as_offset) && (uu == ' '))
+            {
+                html_text_page.get_cur_line()->append_padding_char();
+                // ignore horiz_scaling, as it has been merged into CTM
+                double offset = ax * cur_font_size + cur_letter_space + (is_space ? cur_word_space : 0);
+                html_text_page.get_cur_line()->append_offset(offset * draw_text_scale);
             }
             else
             {
-                Unicode uu;
-                if(cur_text_state.font_info->use_tounicode)
-                {
-                    uu = check_unicode(u, uLen, code, font);
-                }
-                else
-                {
-                    uu = unicode_from_font(code, font);
-                }
                 html_text_page.get_cur_line()->append_unicodes(&uu, 1, ddx);
                 /*
                  * In PDF, word_space is appended if (n == 1 and *p = ' ')