From 32c866feb04eb81fcad259aaab61e4e0070bf8ef Mon Sep 17 00:00:00 2001
From: ulatekh <ulatekh@yahoo.com>
Date: Sun, 15 Jul 2018 09:10:51 -0700
Subject: [PATCH 3/3] Emit more font information when pdftohtml is run with
 -xml.

This extra information makes it easier to infer the text's meaning.
---
 utils/HtmlFonts.cc | 100 +++++++++++++++++++++++++++++++++++++++++++++++++----
 utils/HtmlFonts.h  |   6 ++++
 2 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/utils/HtmlFonts.cc b/utils/HtmlFonts.cc
index 4c65d5f5..94986c37 100644
--- a/utils/HtmlFonts.cc
+++ b/utils/HtmlFonts.cc
@@ -125,6 +125,43 @@ HtmlFont::HtmlFont(GfxFont *font, int _size, GfxRGB rgb){
     fontname = nullptr;
     FontName = nullptr;
   }
+
+  // Try to get more information about this font, for the XML dump.
+  GooString *familyName = font->getFamily();
+  this->FamilyName = (familyName != nullptr) ? new GooString(familyName) : nullptr;
+  this->FontID = *font->getID();
+  GooString *embeddedFontName = font->getEmbeddedFontName();
+  this->EmbeddedFontName = (embeddedFontName != nullptr) ? new GooString(embeddedFontName) : nullptr;
+  // NOTE: Removing the suffixes will make the emitted <fontspec>
+  // values the same, but there will still be duplicate-looking entries.
+  // If they're merged now, the emitted HTML won't have <b> and <i>
+  // elements in mixed-style text-lines.
+  // So those will have to be detected and merged at post-emit
+  // analysis time.
+  char const *apszSuffixes[] = { "-Italic", "-Bold", "-Regular",
+    "-Curves", "-Corners" };
+  // (Remove the style from the font name.  FontName may have been left
+  // as nullptr just above, so it is checked before every dereference.)
+  for(char const *pszSuffix : apszSuffixes)
+  {
+    if (this->FontName != nullptr && this->FontName->endsWith(pszSuffix))
+    {
+      int iLength = this->FontName->getLength();
+      int iSuffixLength = strlen(pszSuffix);
+      this->FontName->del(iLength - iSuffixLength, iSuffixLength);
+    }
+  }
+  // (Remove the style from the embedded font name, which is nullptr when
+  // the font did not report one.)
+  for(char const *pszSuffix : apszSuffixes)
+  {
+    if (this->EmbeddedFontName != nullptr && this->EmbeddedFontName->endsWith(pszSuffix))
+    {
+      int iLength = this->EmbeddedFontName->getLength();
+      int iSuffixLength = strlen(pszSuffix);
+      this->EmbeddedFontName->del(iLength - iSuffixLength, iSuffixLength);
+    }
+  }
   
   lineSize = -1;
 
@@ -169,6 +206,9 @@ HtmlFont::HtmlFont(const HtmlFont& x){
    pos=x.pos;
    color=x.color;
    FontName = (x.FontName) ? new GooString(x.FontName) : nullptr;
+   FamilyName = (x.FamilyName) ? new GooString(x.FamilyName) : nullptr;
+   this->FontID = x.FontID;
+   EmbeddedFontName = (x.EmbeddedFontName) ? new GooString(x.EmbeddedFontName) : nullptr;
    rotOrSkewed = x.rotOrSkewed;
    memcpy(rotSkewMat, x.rotSkewMat, sizeof(rotSkewMat));
  }
@@ -176,6 +216,8 @@ HtmlFont::HtmlFont(const HtmlFont& x){
 
 HtmlFont::~HtmlFont(){
   if (FontName) delete FontName;
+  if (FamilyName) delete FamilyName; // heap copy made by the constructors / operator=; may be nullptr
+  if (EmbeddedFontName) delete EmbeddedFontName; // likewise an owned heap copy; may be nullptr
 }
 
 HtmlFont& HtmlFont::operator=(const HtmlFont& x){
@@ -188,6 +230,11 @@ HtmlFont& HtmlFont::operator=(const HtmlFont& x){
    color=x.color;
    if (FontName) delete FontName;
    FontName = (x.FontName) ? new GooString(x.FontName) : nullptr;
+   if (FamilyName) delete FamilyName;
+   FamilyName = (x.FamilyName) ? new GooString(x.FamilyName) : nullptr;
+   this->FontID = x.FontID;
+   if (EmbeddedFontName) delete EmbeddedFontName;
+   EmbeddedFontName = (x.EmbeddedFontName) ? new GooString(x.EmbeddedFontName) : nullptr;
    return *this;
 }
 
@@ -204,10 +251,16 @@ void HtmlFont::clear(){
 */
 GBool HtmlFont::isEqual(const HtmlFont& x) const{
   return (size==x.size) &&
-	  (lineSize==x.lineSize) &&
-	  (pos==x.pos) && (bold==x.bold) && (italic==x.italic) &&
-	  (color.isEqual(x.getColor())) && isRotOrSkewed() == x.isRotOrSkewed() &&
-	  (!isRotOrSkewed() || rot_matrices_equal(getRotMat(), x.getRotMat()));
+    (lineSize==x.lineSize) && // NOTE(review): FontID is not part of this comparison - confirm that is intended
+    ((FontName == nullptr && x.FontName == nullptr) // font names match when both are unset...
+      || (FontName != nullptr && x.FontName != nullptr && FontName->cmp(x.FontName) == 0)) && // ...or both set with identical text
+    ((FamilyName == nullptr && x.FamilyName == nullptr) // same null-safe rule for the family name
+      || (FamilyName != nullptr && x.FamilyName != nullptr && FamilyName->cmp(x.FamilyName) == 0)) &&
+    ((EmbeddedFontName == nullptr && x.EmbeddedFontName == nullptr) // and for the embedded font name
+      || (EmbeddedFontName != nullptr && x.EmbeddedFontName != nullptr && EmbeddedFontName->cmp(x.EmbeddedFontName) == 0)) &&
+      (pos==x.pos) && (bold==x.bold) && (italic==x.italic) &&
+    (color.isEqual(x.getColor())) && isRotOrSkewed() == x.isRotOrSkewed() &&
+    (!isRotOrSkewed() || rot_matrices_equal(getRotMat(), x.getRotMat())); // matrices only compared when rotated/skewed
 }
 
 /*
@@ -216,8 +269,10 @@ GBool HtmlFont::isEqual(const HtmlFont& x) const{
 */
 GBool HtmlFont::isEqualIgnoreBold(const HtmlFont& x) const{
   return ((size==x.size) &&
-	  (!strcmp(fonts[pos].name, fonts[x.pos].name)) &&
-	  (color.isEqual(x.getColor())));
+    ((FontName == nullptr && x.FontName == nullptr) // font names match when both are unset...
+        || (FontName != nullptr && x.FontName != nullptr && FontName->cmp(x.FontName) == 0)) && // ...or both set with identical text
+    (!strcmp(fonts[pos].name, fonts[x.pos].name)) && // same entry name in the fonts table
+    (color.isEqual(x.getColor()))); // bold, italic, lineSize and rotation are not compared here
 }
 
 GooString* HtmlFont::getFontName(){
@@ -225,6 +280,21 @@ GooString* HtmlFont::getFontName(){
     else return new GooString(DefaultFont);
 }
 
+GooString* HtmlFont::getFamilyName() const
+{
+  return FamilyName; // internal pointer (not a copy): freed by ~HtmlFont, may be nullptr; callers must not delete it
+}
+
+Ref HtmlFont::getFontID() const
+{
+  return FontID; // returned by value; copied from *font->getID() when built from a GfxFont
+}
+
+GooString* HtmlFont::getEmbeddedFontName() const
+{
+  return EmbeddedFontName; // internal pointer (not a copy): freed by ~HtmlFont, may be nullptr; callers must not delete it
+}
+
 GooString* HtmlFont::getFullName(){
   if (FontName)
     return new GooString(FontName);
@@ -330,6 +400,8 @@ GooString* HtmlFontAccu::CSStyle(int i, int j){
    GooString *Size=GooString::fromInt(font.getSize());
    GooString *colorStr=font.getColor().toString();
    GooString *fontName=(fontFullName ? font.getFullName() : font.getFontName());
+   GooString *familyName = font.getFamilyName();
+   GooString *embeddedFontName = font.getEmbeddedFontName();
    GooString *lSize;
    
    if(!xml){
@@ -375,15 +447,31 @@ GooString* HtmlFontAccu::CSStyle(int i, int j){
      tmp->append(";}");
    }
    if (xml) {
+     GooString *rotOrSkewedStr=GooString::fromInt(font.isRotOrSkewed() ? 1 : 0);
+
      tmp->append("<fontspec id=\"");
      tmp->append(iStr);
      tmp->append("\" size=\"");
      tmp->append(Size);
      tmp->append("\" family=\"");
      tmp->append(fontName); //font.getFontName());
+     if (familyName != nullptr)
+     {
+         tmp->append("\" fontFamily=\"");
+         tmp->append(familyName);
+     }
+     if (embeddedFontName != nullptr)
+     {
+         tmp->append("\" embeddedFontName=\"");
+         tmp->append(embeddedFontName);
+     }
+     tmp->append("\" rotOrSkewed=\"");
+     tmp->append(rotOrSkewedStr);
      tmp->append("\" color=\"");
      tmp->append(colorStr);
      tmp->append("\"/>");
+
+     delete rotOrSkewedStr;
    }
 
    delete fontName;
diff --git a/utils/HtmlFonts.h b/utils/HtmlFonts.h
index ba4f42ae..d5b24a64 100644
--- a/utils/HtmlFonts.h
+++ b/utils/HtmlFonts.h
@@ -68,6 +68,9 @@ class HtmlFont{
    int pos; // position of the font name in the fonts array
    static GooString *DefaultFont;
    GooString *FontName;
+   GooString *FamilyName;
+   Ref FontID;
+   GooString *EmbeddedFontName;
    HtmlFontColor color;
    double rotSkewMat[4]; // only four values needed for rotation and skew
 public:  
@@ -89,6 +92,9 @@ public:
    { rotOrSkewed = gTrue; memcpy(rotSkewMat, mat, sizeof(rotSkewMat)); }
    const double *getRotMat() const { return rotSkewMat; }
    GooString* getFontName();
+   GooString* getFamilyName() const;
+   Ref getFontID() const;
+   GooString* getEmbeddedFontName() const;
    static GooString* getDefaultFont();
    static void setDefaultFont(GooString* defaultFont);
    static GooString* HtmlFilter(const Unicode* u, int uLen); //char* s);
-- 
2.14.4

