From 70361973b37ac96a1c567c3d6038537192463c98 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Sun, 7 Jan 2018 14:49:17 +0000 Subject: [PATCH] add support for reading encoding differences from font dictionary. add type 3 font support. --- .../SwedishTouringCarChampionshipTests.cs | 4 +- .../Encodings/DifferenceBasedEncoding.cs | 42 ++++++ src/UglyToad.Pdf/Fonts/FontFactory.cs | 6 +- .../Fonts/Parser/EncodingReader.cs | 139 +++++++++++++++++ .../Parser/Handlers/TrueTypeFontHandler.cs | 32 +--- .../Fonts/Parser/Handlers/Type1FontHandler.cs | 32 +--- .../Fonts/Parser/Handlers/Type3FontHandler.cs | 142 ++++++++++++++++++ .../Fonts/Parser/IEncodingReader.cs | 11 ++ src/UglyToad.Pdf/Fonts/Simple/Type3Font.cs | 83 ++++++++++ src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs | 7 +- 10 files changed, 437 insertions(+), 61 deletions(-) create mode 100644 src/UglyToad.Pdf/Fonts/Encodings/DifferenceBasedEncoding.cs create mode 100644 src/UglyToad.Pdf/Fonts/Parser/EncodingReader.cs create mode 100644 src/UglyToad.Pdf/Fonts/Parser/Handlers/Type3FontHandler.cs create mode 100644 src/UglyToad.Pdf/Fonts/Parser/IEncodingReader.cs create mode 100644 src/UglyToad.Pdf/Fonts/Simple/Type3Font.cs diff --git a/src/UglyToad.Pdf.Tests/Integration/SwedishTouringCarChampionshipTests.cs b/src/UglyToad.Pdf.Tests/Integration/SwedishTouringCarChampionshipTests.cs index 0876a467..1eb742c9 100644 --- a/src/UglyToad.Pdf.Tests/Integration/SwedishTouringCarChampionshipTests.cs +++ b/src/UglyToad.Pdf.Tests/Integration/SwedishTouringCarChampionshipTests.cs @@ -65,9 +65,9 @@ //[Fact] //public void localFileTest() //{ - // using (var document = PdfDocument.Open(@"C:\Users\eliot\Downloads\CV.pdf")) + // //using (var document = PdfDocument.Open(@"C:\Users\eliot\Downloads\Document (1).pdf")) // { - // var page = document.GetPage(1); + // //var page = document.GetPage(1); // } //} } diff --git a/src/UglyToad.Pdf/Fonts/Encodings/DifferenceBasedEncoding.cs 
b/src/UglyToad.Pdf/Fonts/Encodings/DifferenceBasedEncoding.cs
new file mode 100644
index 00000000..dd9e74d9
--- /dev/null
+++ b/src/UglyToad.Pdf/Fonts/Encodings/DifferenceBasedEncoding.cs
@@ -0,0 +1,56 @@
+namespace UglyToad.Pdf.Fonts.Encodings
+{
+    using System;
+    using System.Collections.Generic;
+    using System.Linq;
+
+    /// <summary>
+    /// Created by combining a base encoding with the differences.
+    /// </summary>
+    internal class DifferenceBasedEncoding : Encoding
+    {
+        /// <summary>
+        /// "Difference " followed by the name of the base encoding.
+        /// </summary>
+        public override string EncodingName { get; } = "Difference Encoding";
+
+        /// <summary>
+        /// Builds the encoding: every (code, name) difference is added first, then each
+        /// base encoding entry whose code is not overridden by a difference.
+        /// </summary>
+        /// <param name="baseEncoding">The encoding the differences are applied on top of.</param>
+        /// <param name="differences">The (character code, glyph name) overrides.</param>
+        /// <exception cref="ArgumentNullException">Either argument is null.</exception>
+        public DifferenceBasedEncoding(Encoding baseEncoding, IReadOnlyList<(int, string)> differences)
+        {
+            if (baseEncoding == null)
+            {
+                throw new ArgumentNullException(nameof(baseEncoding));
+            }
+
+            if (differences == null)
+            {
+                throw new ArgumentNullException(nameof(differences));
+            }
+
+            EncodingName = "Difference " + baseEncoding.EncodingName;
+
+            foreach (var difference in differences)
+            {
+                Add(difference.Item1, difference.Item2);
+            }
+
+            // Collect the overridden codes once so each base entry is checked with a single
+            // set lookup rather than a scan of the whole differences list.
+            var overriddenCodes = new HashSet<int>(differences.Select(x => x.Item1));
+
+            foreach (var pair in baseEncoding.CodeToNameMap)
+            {
+                if (!overriddenCodes.Contains(pair.Key))
+                {
+                    Add(pair.Key, pair.Value);
+                }
+            }
+        }
+    }
+}
diff --git a/src/UglyToad.Pdf/Fonts/FontFactory.cs b/src/UglyToad.Pdf/Fonts/FontFactory.cs
index 266cbb7d..14ff5b64 100644
--- a/src/UglyToad.Pdf/Fonts/FontFactory.cs
+++ b/src/UglyToad.Pdf/Fonts/FontFactory.cs
@@ -14,14 +14,16 @@
         private readonly ILog log;
         private readonly IReadOnlyDictionary<CosName, IFontHandler> handlers;
 
-        public FontFactory(ILog log, Type0FontHandler type0FontHandler, TrueTypeFontHandler trueTypeFontHandler, Type1FontHandler type1FontHandler)
+        public FontFactory(ILog log, Type0FontHandler type0FontHandler, TrueTypeFontHandler trueTypeFontHandler,
+            Type1FontHandler type1FontHandler, Type3FontHandler type3FontHandler)
         {
             this.log = log;
             handlers = new Dictionary<CosName, IFontHandler>
             {
                 {CosName.TYPE0, type0FontHandler},
                 {CosName.TRUE_TYPE, trueTypeFontHandler},
-                {CosName.TYPE1, type1FontHandler}
+                {CosName.TYPE1, type1FontHandler},
+                {CosName.TYPE3, type3FontHandler}
             };
         }
 
diff --git a/src/UglyToad.Pdf/Fonts/Parser/EncodingReader.cs b/src/UglyToad.Pdf/Fonts/Parser/EncodingReader.cs
new file mode 100644
index 00000000..a4a18a20
--- /dev/null
+++ b/src/UglyToad.Pdf/Fonts/Parser/EncodingReader.cs
@@ -0,0 +1,171 @@
+namespace UglyToad.Pdf.Fonts.Parser
+{
+    using System.Collections.Generic;
+    using ContentStream;
+    using Cos;
+    using Encodings;
+    using Exceptions;
+    using IO;
+    using Pdf.Parser;
+    using Pdf.Parser.Parts;
+
+    /// <summary>
+    /// Reads the encoding of a font from the encoding entry of its font dictionary.
+    /// </summary>
+    internal class EncodingReader : IEncodingReader
+    {
+        private readonly IPdfObjectParser pdfObjectParser;
+
+        public EncodingReader(IPdfObjectParser pdfObjectParser)
+        {
+            this.pdfObjectParser = pdfObjectParser;
+        }
+
+        /// <summary>
+        /// Gets the encoding described by the encoding entry of the font dictionary.
+        /// </summary>
+        /// <returns>
+        /// Null when there is no encoding entry, the named encoding when the entry is a name,
+        /// otherwise the encoding built from the direct or referenced encoding dictionary.
+        /// </returns>
+        /// <exception cref="InvalidFontFormatException">The entry is not a name, reference or dictionary.</exception>
+        public Encoding Read(PdfDictionary fontDictionary, IRandomAccessRead reader, bool isLenientParsing, FontDescriptor descriptor = null)
+        {
+            if (!fontDictionary.TryGetValue(CosName.ENCODING, out var baseEncodingObject))
+            {
+                return null;
+            }
+
+            if (baseEncodingObject is CosName name)
+            {
+                return GetNamedEncoding(descriptor, name);
+            }
+
+            PdfDictionary encodingDictionary;
+            if (baseEncodingObject is CosObject reference)
+            {
+                encodingDictionary = DirectObjectFinder.Find<PdfDictionary>(reference, pdfObjectParser, reader, isLenientParsing);
+            }
+            else if (baseEncodingObject is PdfDictionary dictionary)
+            {
+                encodingDictionary = dictionary;
+            }
+            else
+            {
+                throw new InvalidFontFormatException($"The font encoding was not a named entry or dictionary, instead it was: {baseEncodingObject}.");
+            }
+
+            var encoding = ReadEncodingDictionary(encodingDictionary, reader, isLenientParsing);
+
+            return encoding;
+        }
+
+        /// <summary>
+        /// Builds an encoding from an encoding dictionary: the base encoding (standard encoding when absent)
+        /// with the entries of the differences array, if any, applied on top.
+        /// </summary>
+        private Encoding ReadEncodingDictionary(PdfDictionary encodingDictionary, IRandomAccessRead reader, bool isLenientParsing)
+        {
+            Encoding baseEncoding;
+            if (encodingDictionary.TryGetName(CosName.BASE_ENCODING, out var baseEncodingName))
+            {
+                if (!Encoding.TryGetNamedEncoding(baseEncodingName, out baseEncoding))
+                {
+                    throw new InvalidFontFormatException($"No encoding found with name {baseEncodingName} to use as base encoding.");
+                }
+            }
+            else
+            {
+                // TODO: This isn't true for non-symbolic fonts or latin fonts (based on OS?) see section 5.5.5
+                baseEncoding = StandardEncoding.Instance;
+            }
+
+            if (!encodingDictionary.TryGetValue(CosName.DIFFERENCES, out var differencesBase))
+            {
+                return baseEncoding;
+            }
+
+            var differenceArray = differencesBase as COSArray;
+            if (differenceArray == null)
+            {
+                if (differencesBase is CosObject differencesObj)
+                {
+                    differenceArray = DirectObjectFinder.Find<COSArray>(differencesObj, pdfObjectParser, reader, isLenientParsing);
+                }
+                else
+                {
+                    throw new InvalidFontFormatException($"Differences was not an array: {differencesBase}.");
+                }
+            }
+
+            var differences = ProcessDifferences(differenceArray);
+
+            var newEncoding = new DifferenceBasedEncoding(baseEncoding, differences);
+
+            return newEncoding;
+        }
+
+        /// <summary>
+        /// Flattens a differences array into (code, name) pairs. A number sets the current code; each name
+        /// is paired with the current code, which is then incremented.
+        /// </summary>
+        /// <exception cref="InvalidFontFormatException">An entry after the first is neither a number nor a name.</exception>
+        private static IReadOnlyList<(int, string)> ProcessDifferences(COSArray differenceArray)
+        {
+            var differences = new List<(int, string)>();
+
+            // An empty array defines no differences and has no initial code to read.
+            if (differenceArray.Count == 0)
+            {
+                return differences;
+            }
+
+            var activeCode = differenceArray.getInt(0);
+
+            for (int i = 1; i < differenceArray.Count; i++)
+            {
+                var entry = differenceArray.get(i);
+
+                if (entry is ICosNumber numeric)
+                {
+                    activeCode = numeric.AsInt();
+                }
+                else if (entry is CosName name)
+                {
+                    differences.Add((activeCode, name.Name));
+                    activeCode++;
+                }
+                else
+                {
+                    throw new InvalidFontFormatException($"Unexpected entry in the differences array: {differenceArray}.");
+                }
+            }
+
+            return differences;
+        }
+
+        /// <summary>
+        /// Looks up a predefined encoding by name, falling back to the standard encoding for a symbolic
+        /// font whose encoding name is not recognised.
+        /// </summary>
+        /// <exception cref="InvalidFontFormatException">The name is not recognised and the font is not symbolic.</exception>
+        private static Encoding GetNamedEncoding(FontDescriptor descriptor, CosName encodingName)
+        {
+            if (Encoding.TryGetNamedEncoding(encodingName, out var encoding))
+            {
+                return encoding;
+            }
+
+            // Symbolic fonts default to standard encoding. The default is applied only after a failed lookup
+            // because the lookup's out parameter overwrites anything assigned before it.
+            if (descriptor?.Flags.HasFlag(FontFlags.Symbolic) == true)
+            {
+                return StandardEncoding.Instance;
+            }
+
+            // TODO: PDFBox would not throw here.
+            throw new InvalidFontFormatException($"Unrecognised encoding name: {encodingName}");
+        }
+    }
+}
+
diff --git a/src/UglyToad.Pdf/Fonts/Parser/Handlers/TrueTypeFontHandler.cs b/src/UglyToad.Pdf/Fonts/Parser/Handlers/TrueTypeFontHandler.cs
index 5283b68c..2ff384df 100644
--- a/src/UglyToad.Pdf/Fonts/Parser/Handlers/TrueTypeFontHandler.cs
+++ b/src/UglyToad.Pdf/Fonts/Parser/Handlers/TrueTypeFontHandler.cs
@@ -22,17 +22,20 @@
         private readonly CMapCache cMapCache;
         private readonly FontDescriptorFactory fontDescriptorFactory;
         private readonly TrueTypeFontParser trueTypeFontParser;
+        private readonly IEncodingReader encodingReader;
 
         public TrueTypeFontHandler(IPdfObjectParser pdfObjectParser, IFilterProvider filterProvider,
             CMapCache cMapCache,
             FontDescriptorFactory fontDescriptorFactory,
-            TrueTypeFontParser trueTypeFontParser)
+            TrueTypeFontParser trueTypeFontParser,
+            IEncodingReader encodingReader)
         {
             this.pdfObjectParser = pdfObjectParser;
             this.filterProvider = filterProvider;
             this.cMapCache = cMapCache;
             this.fontDescriptorFactory = fontDescriptorFactory;
             this.trueTypeFontParser = trueTypeFontParser;
+            this.encodingReader = encodingReader;
         }
 
         public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
@@ -62,32 +65,7 @@
                 }
             }
 
-            Encoding encoding = null;
-            if (dictionary.TryGetValue(CosName.ENCODING, out var encodingBase))
-            {
-                // Symbolic fonts default to standard encoding.
-                if (descriptor.Flags.HasFlag(FontFlags.Symbolic))
-                {
-                    encoding = StandardEncoding.Instance;
-                }
-
-                if (encodingBase is CosName encodingName)
-                {
-                    if (!Encoding.TryGetNamedEncoding(encodingName, out encoding))
-                    {
-                        // TODO: PDFBox would not throw here.
- throw new InvalidFontFormatException($"Unrecognised encoding name: {encodingName}"); - } - } - else if (encodingBase is CosDictionary encodingDictionary) - { - throw new NotImplementedException("No support for reading encoding from dictionary yet."); - } - else - { - throw new NotImplementedException("No support for reading encoding from font yet."); - } - } + Encoding encoding = encodingReader.Read(dictionary, reader, isLenientParsing, descriptor); return new TrueTypeSimpleFont(name, firstCharacter, lastCharacter, widths, descriptor, toUnicodeCMap, encoding); } diff --git a/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type1FontHandler.cs b/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type1FontHandler.cs index 5b95a810..c0e11d49 100644 --- a/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type1FontHandler.cs +++ b/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type1FontHandler.cs @@ -18,13 +18,16 @@ private readonly CMapCache cMapCache; private readonly IFilterProvider filterProvider; private readonly FontDescriptorFactory fontDescriptorFactory; + private readonly IEncodingReader encodingReader; - public Type1FontHandler(IPdfObjectParser pdfObjectParser, CMapCache cMapCache, IFilterProvider filterProvider, FontDescriptorFactory fontDescriptorFactory) + public Type1FontHandler(IPdfObjectParser pdfObjectParser, CMapCache cMapCache, IFilterProvider filterProvider, + FontDescriptorFactory fontDescriptorFactory, IEncodingReader encodingReader) { this.pdfObjectParser = pdfObjectParser; this.cMapCache = cMapCache; this.filterProvider = filterProvider; this.fontDescriptorFactory = fontDescriptorFactory; + this.encodingReader = encodingReader; } public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing) @@ -66,32 +69,7 @@ } } - Encoding encoding = null; - if (dictionary.TryGetValue(CosName.ENCODING, out var encodingBase)) - { - // Symbolic fonts default to standard encoding. 
-                if (descriptor.Flags.HasFlag(FontFlags.Symbolic))
-                {
-                    encoding = StandardEncoding.Instance;
-                }
-
-                if (encodingBase is CosName encodingName)
-                {
-                    if (!Encoding.TryGetNamedEncoding(encodingName, out encoding))
-                    {
-                        // TODO: PDFBox would not throw here.
-                        throw new InvalidFontFormatException($"Unrecognised encoding name: {encodingName}");
-                    }
-                }
-                else if (encodingBase is CosDictionary)
-                {
-                    throw new NotImplementedException("No support for reading encoding from dictionary yet.");
-                }
-                else
-                {
-                    throw new NotImplementedException("No support for reading encoding from font yet.");
-                }
-            }
+            Encoding encoding = encodingReader.Read(dictionary, reader, isLenientParsing, descriptor);
 
             return new TrueTypeSimpleFont(name, firstCharacter, lastCharacter, widths, descriptor, toUnicodeCMap, encoding);
         }
diff --git a/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type3FontHandler.cs b/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type3FontHandler.cs
new file mode 100644
index 00000000..e7720453
--- /dev/null
+++ b/src/UglyToad.Pdf/Fonts/Parser/Handlers/Type3FontHandler.cs
@@ -0,0 +1,138 @@
+namespace UglyToad.Pdf.Fonts.Parser.Handlers
+{
+    using System;
+    using Cmap;
+    using ContentStream;
+    using Core;
+    using Cos;
+    using Encodings;
+    using Exceptions;
+    using Filters;
+    using Geometry;
+    using IO;
+    using Pdf.Parser;
+    using Pdf.Parser.Parts;
+    using Simple;
+
+    /// <summary>
+    /// Creates a <see cref="Type3Font"/> from a Type 3 font dictionary.
+    /// </summary>
+    internal class Type3FontHandler : IFontHandler
+    {
+        private readonly IPdfObjectParser pdfObjectParser;
+        private readonly CMapCache cMapCache;
+        private readonly IFilterProvider filterProvider;
+        private readonly IEncodingReader encodingReader;
+
+        public Type3FontHandler(IPdfObjectParser pdfObjectParser, CMapCache cMapCache, IFilterProvider filterProvider, IEncodingReader encodingReader)
+        {
+            this.pdfObjectParser = pdfObjectParser;
+            this.cMapCache = cMapCache;
+            this.filterProvider = filterProvider;
+            this.encodingReader = encodingReader;
+        }
+
+        /// <summary>
+        /// Reads the bounding box, font matrix, character range, widths, encoding and optional
+        /// to-unicode CMap from the dictionary and uses them to build the font.
+        /// </summary>
+        /// <exception cref="InvalidFontFormatException">The bounding box or font matrix entry is missing or malformed.</exception>
+        public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
+        {
+            var boundingBox = GetBoundingBox(dictionary);
+
+            var fontMatrix = GetFontMatrix(dictionary, reader, isLenientParsing);
+
+            var firstCharacter = FontDictionaryAccessHelper.GetFirstCharacter(dictionary);
+            var lastCharacter = FontDictionaryAccessHelper.GetLastCharacter(dictionary);
+            var widths = FontDictionaryAccessHelper.GetWidths(pdfObjectParser, dictionary, reader, isLenientParsing);
+
+            Encoding encoding = encodingReader.Read(dictionary, reader, isLenientParsing);
+
+            CMap toUnicodeCMap = null;
+            if (dictionary.TryGetItemOfType(CosName.TO_UNICODE, out CosObject toUnicodeObj))
+            {
+                var toUnicode = pdfObjectParser.Parse(toUnicodeObj.ToIndirectReference(), reader, isLenientParsing) as PdfRawStream;
+
+                var decodedUnicodeCMap = toUnicode?.Decode(filterProvider);
+
+                if (decodedUnicodeCMap != null)
+                {
+                    toUnicodeCMap = cMapCache.Parse(new ByteArrayInputBytes(decodedUnicodeCMap), isLenientParsing);
+                }
+            }
+
+            return new Type3Font(CosName.UNCHANGED, boundingBox, fontMatrix, encoding, firstCharacter,
+                lastCharacter, widths, toUnicodeCMap);
+        }
+
+        /// <summary>
+        /// Reads the six values of the font matrix entry, which may be a direct array or a reference to one.
+        /// </summary>
+        private TransformationMatrix GetFontMatrix(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
+        {
+            if (!dictionary.TryGetValue(CosName.FONT_MATRIX, out var matrixObject))
+            {
+                throw new InvalidFontFormatException($"No font matrix found: {dictionary}.");
+            }
+
+            COSArray matrixArray;
+            if (matrixObject is COSArray arr)
+            {
+                matrixArray = arr;
+            }
+            else if (matrixObject is CosObject obj)
+            {
+                matrixArray = DirectObjectFinder.Find<COSArray>(obj, pdfObjectParser, reader, isLenientParsing);
+            }
+            else
+            {
+                throw new InvalidFontFormatException($"The font matrix object was not an array or reference to an array: {matrixObject}.");
+            }
+
+            return TransformationMatrix.FromValues(GetDecimal(matrixArray, 0), GetDecimal(matrixArray, 1),
+                GetDecimal(matrixArray, 2), GetDecimal(matrixArray, 3), GetDecimal(matrixArray, 4), GetDecimal(matrixArray, 5));
+        }
+
+        /// <summary>
+        /// Gets the number at the given index of the array.
+        /// </summary>
+        /// <exception cref="InvalidFontFormatException">The index is beyond the end of the array or the entry is not a number.</exception>
+        private static decimal GetDecimal(COSArray array, int index)
+        {
+            if (index >= array.Count)
+            {
+                throw new InvalidFontFormatException($"The array did not contain enough entries to be the font matrix: {array}.");
+            }
+
+            var item = array.get(index) as ICosNumber;
+
+            if (item == null)
+            {
+                throw new InvalidFontFormatException($"The array did not contain a decimal at position {index}: {array}.");
+            }
+
+            return item.AsDecimal();
+        }
+
+        /// <summary>
+        /// Reads the font bounding box entry. An entry which is not a direct array gives an empty rectangle.
+        /// </summary>
+        /// <exception cref="InvalidFontFormatException">There is no font bounding box entry.</exception>
+        private static PdfRectangle GetBoundingBox(PdfDictionary dictionary)
+        {
+            if (!dictionary.TryGetValue(CosName.FONT_BBOX, out var bboxObject))
+            {
+                throw new InvalidFontFormatException($"Type 3 font was invalid. No Font Bounding Box: {dictionary}.");
+            }
+
+            if (bboxObject is COSArray bboxArray)
+            {
+                return new PdfRectangle(GetDecimal(bboxArray, 0), GetDecimal(bboxArray, 1),
+                    GetDecimal(bboxArray, 2), GetDecimal(bboxArray, 3));
+            }
+
+            return new PdfRectangle(0, 0, 0, 0);
+        }
+    }
+}
diff --git a/src/UglyToad.Pdf/Fonts/Parser/IEncodingReader.cs b/src/UglyToad.Pdf/Fonts/Parser/IEncodingReader.cs
new file mode 100644
index 00000000..b13a66cd
--- /dev/null
+++ b/src/UglyToad.Pdf/Fonts/Parser/IEncodingReader.cs
@@ -0,0 +1,17 @@
+namespace UglyToad.Pdf.Fonts.Parser
+{
+    using ContentStream;
+    using Encodings;
+    using IO;
+
+    /// <summary>
+    /// Reads the encoding of a font from its font dictionary.
+    /// </summary>
+    internal interface IEncodingReader
+    {
+        /// <summary>
+        /// Gets the encoding for the font dictionary; the font descriptor is optional.
+        /// </summary>
+        Encoding Read(PdfDictionary fontDictionary, IRandomAccessRead reader, bool isLenientParsing, FontDescriptor descriptor = null);
+    }
+}
\ No newline at end of file
diff --git a/src/UglyToad.Pdf/Fonts/Simple/Type3Font.cs b/src/UglyToad.Pdf/Fonts/Simple/Type3Font.cs
new file mode 100644
index 00000000..207eed1d
--- /dev/null
+++ b/src/UglyToad.Pdf/Fonts/Simple/Type3Font.cs
@@ -0,0 +1,110 @@
+namespace UglyToad.Pdf.Fonts.Simple
+{
+    using Cmap;
+    using Composite;
+    using Core;
+    using Cos;
+    using Encodings;
+    using Exceptions;
+    using Geometry;
+    using IO;
+
+    /// <summary>
+    /// A Type 3 font described by its bounding box, font matrix, encoding and widths.
+    /// </summary>
+    internal class Type3Font : IFont
+    {
+        private readonly PdfRectangle boundingBox;
+        private readonly TransformationMatrix fontMatrix;
+        private readonly Encoding encoding;
+        private readonly int firstChar;
+        private readonly int lastChar;
+        private readonly decimal[] widths;
+        private readonly ToUnicodeCMap toUnicodeCMap;
+
+        public CosName Name { get; }
+
+        public bool IsVertical { get; } = false;
+
+        public Type3Font(CosName name, PdfRectangle boundingBox, TransformationMatrix fontMatrix,
+            Encoding encoding, int firstChar, int lastChar, decimal[] widths,
+            CMap toUnicodeCMap)
+        {
+            Name = name;
+
+            this.boundingBox = boundingBox;
+            this.fontMatrix = fontMatrix;
+            this.encoding = encoding;
+            this.firstChar = firstChar;
+            this.lastChar = lastChar;
+            this.widths = widths;
+            this.toUnicodeCMap = new ToUnicodeCMap(toUnicodeCMap);
+        }
+
+        /// <summary>
+        /// Reads a single byte character code.
+        /// </summary>
+        public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
+        {
+            codeLength = 1;
+            return bytes.CurrentByte;
+        }
+
+        /// <summary>
+        /// Maps the character code using the to-unicode CMap when it can map to unicode, otherwise
+        /// through the encoding's glyph name and the Adobe glyph list.
+        /// </summary>
+        public bool TryGetUnicode(int characterCode, out string value)
+        {
+            if (toUnicodeCMap.CanMapToUnicode)
+            {
+                return toUnicodeCMap.TryGet(characterCode, out value);
+            }
+
+            // No encoding was supplied so there is no glyph name to look up.
+            if (encoding == null)
+            {
+                value = null;
+                return false;
+            }
+
+            var name = encoding.GetName(characterCode);
+
+            var listed = GlyphList.AdobeGlyphList.NameToUnicode(name);
+
+            value = listed;
+
+            return true;
+        }
+
+        /// <summary>
+        /// Gets the width of the character transformed by the font matrix.
+        /// </summary>
+        public PdfVector GetDisplacement(int characterCode)
+        {
+            return fontMatrix.Transform(new PdfVector(GetWidth(characterCode), 0));
+        }
+
+        /// <summary>
+        /// Gets the width for the character code from the widths array.
+        /// </summary>
+        /// <exception cref="InvalidFontFormatException">The code has no entry in the widths array.</exception>
+        public decimal GetWidth(int characterCode)
+        {
+            // A widths array shorter than the declared character range is reported as a font error
+            // rather than surfacing as an index out of range failure.
+            if (characterCode < firstChar || characterCode > lastChar
+                || widths == null || characterCode - firstChar >= widths.Length)
+            {
+                throw new InvalidFontFormatException($"The character code was not contained in the widths array: {characterCode}.");
+            }
+
+            return widths[characterCode - firstChar];
+        }
+
+        public TransformationMatrix GetFontMatrix()
+        {
+            return fontMatrix;
+        }
+    }
+}
diff --git a/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs b/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs
index f1c070f0..e7864406 100644
--- a/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs
+++ b/src/UglyToad.Pdf/Parser/PdfDocumentFactory.cs
@@ -16,7 +16,6 @@
     using IO;
     using Logging;
     using Parts;
-    using Parts.CrossReference;
     using Tokenization.Scanner;
     using Util;
 
@@ -79,6 +78,7 @@
             var fontDescriptorFactory = new FontDescriptorFactory();
 
             var cidFontFactory = new CidFontFactory(fontDescriptorFactory, trueTypeFontParser, pdfObjectParser, filterProvider);
+            var encodingReader = new EncodingReader(pdfObjectParser);
 
             var cMapCache = new CMapCache(new CMapParser());
 
@@ -86,8 +86,9 @@
                 cMapCache,
                 filterProvider,
                 pdfObjectParser),
-                new TrueTypeFontHandler(pdfObjectParser, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser),
-                new Type1FontHandler(pdfObjectParser, cMapCache, filterProvider, fontDescriptorFactory));
+                new TrueTypeFontHandler(pdfObjectParser, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser, encodingReader),
+                new Type1FontHandler(pdfObjectParser, cMapCache, filterProvider, fontDescriptorFactory, encodingReader),
+                new Type3FontHandler(pdfObjectParser, cMapCache, filterProvider, encodingReader));
 
             var dynamicParser = container.Get<DynamicParser>();
             var resourceContainer = new ResourceContainer(pdfObjectParser, fontFactory);
-- 
GitLab