提交 70361973 编写于 作者: E Eliot Jones

Add support for reading encoding differences from the font dictionary. Add Type 3 font support.

上级 18eeb896
......@@ -65,9 +65,9 @@
//[Fact]
//public void localFileTest()
//{
// using (var document = PdfDocument.Open(@"C:\Users\eliot\Downloads\CV.pdf"))
// //using (var document = PdfDocument.Open(@"C:\Users\eliot\Downloads\Document (1).pdf"))
// {
// var page = document.GetPage(1);
// //var page = document.GetPage(1);
// }
//}
}
......
namespace UglyToad.Pdf.Fonts.Encodings
{
using System;
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// An encoding created by combining a base encoding with a list of differences.
/// Every code listed in the differences uses the name from the differences;
/// every other code keeps the name it has in the base encoding.
/// </summary>
internal class DifferenceBasedEncoding : Encoding
{
    /// <summary>
    /// The text "Difference " followed by the name of the base encoding (assigned in the constructor).
    /// </summary>
    public override string EncodingName { get; } = "Difference Encoding";

    /// <summary>
    /// Creates the combined encoding.
    /// </summary>
    /// <param name="baseEncoding">The encoding supplying names for all codes not present in <paramref name="differences"/>.</param>
    /// <param name="differences">Pairs of (character code, glyph name) which replace the base encoding entries.</param>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public DifferenceBasedEncoding(Encoding baseEncoding, IReadOnlyList<(int, string)> differences)
    {
        if (baseEncoding == null)
        {
            throw new ArgumentNullException(nameof(baseEncoding));
        }

        if (differences == null)
        {
            throw new ArgumentNullException(nameof(differences));
        }

        EncodingName = "Difference " + baseEncoding.EncodingName;

        // Remember which codes were overridden so the base encoding pass below is a
        // constant-time lookup per entry rather than a scan of the whole differences list.
        var overriddenCodes = new HashSet<int>();

        foreach (var difference in differences)
        {
            Add(difference.Item1, difference.Item2);
            overriddenCodes.Add(difference.Item1);
        }

        foreach (var pair in baseEncoding.CodeToNameMap)
        {
            if (!overriddenCodes.Contains(pair.Key))
            {
                Add(pair.Key, pair.Value);
            }
        }
    }
}
}
......@@ -14,14 +14,16 @@
private readonly ILog log;
private readonly IReadOnlyDictionary<CosName, IFontHandler> handlers;
public FontFactory(ILog log, Type0FontHandler type0FontHandler, TrueTypeFontHandler trueTypeFontHandler, Type1FontHandler type1FontHandler)
public FontFactory(ILog log, Type0FontHandler type0FontHandler, TrueTypeFontHandler trueTypeFontHandler,
Type1FontHandler type1FontHandler, Type3FontHandler type3FontHandler)
{
this.log = log;
handlers = new Dictionary<CosName, IFontHandler>
{
{CosName.TYPE0, type0FontHandler},
{CosName.TRUE_TYPE, trueTypeFontHandler},
{CosName.TYPE1, type1FontHandler}
{CosName.TYPE1, type1FontHandler},
{CosName.TYPE3, type3FontHandler}
};
}
......
namespace UglyToad.Pdf.Fonts.Parser
{
using System.Collections.Generic;
using ContentStream;
using Cos;
using Encodings;
using Exceptions;
using IO;
using Pdf.Parser;
using Pdf.Parser.Parts;
/// <summary>
/// Reads the <see cref="Encoding"/> of a font from the Encoding entry of its font dictionary.
/// The entry may be a named encoding or an encoding dictionary (directly or by reference)
/// containing an optional base encoding and an optional differences array.
/// </summary>
internal class EncodingReader : IEncodingReader
{
    private readonly IPdfObjectParser pdfObjectParser;

    public EncodingReader(IPdfObjectParser pdfObjectParser)
    {
        this.pdfObjectParser = pdfObjectParser;
    }

    /// <summary>
    /// Reads the encoding described by the font dictionary.
    /// </summary>
    /// <param name="fontDictionary">The font dictionary which may contain an Encoding entry.</param>
    /// <param name="reader">Used to resolve indirect references.</param>
    /// <param name="isLenientParsing">Passed through when resolving indirect references.</param>
    /// <param name="descriptor">Optional font descriptor; only consulted for named encodings.</param>
    /// <returns>The encoding, or null if the font dictionary has no Encoding entry.</returns>
    /// <exception cref="InvalidFontFormatException">
    /// The Encoding entry is not a name, a reference or a dictionary, or its content is invalid.
    /// </exception>
    public Encoding Read(PdfDictionary fontDictionary, IRandomAccessRead reader, bool isLenientParsing, FontDescriptor descriptor = null)
    {
        if (!fontDictionary.TryGetValue(CosName.ENCODING, out var baseEncodingObject))
        {
            return null;
        }

        if (baseEncodingObject is CosName name)
        {
            return GetNamedEncoding(descriptor, name);
        }

        PdfDictionary encodingDictionary;
        if (baseEncodingObject is CosObject reference)
        {
            encodingDictionary = DirectObjectFinder.Find<PdfDictionary>(reference, pdfObjectParser, reader, isLenientParsing);
        }
        else if (baseEncodingObject is PdfDictionary dictionary)
        {
            encodingDictionary = dictionary;
        }
        else
        {
            throw new InvalidFontFormatException($"The font encoding was not a named entry or dictionary, instead it was: {baseEncodingObject}.");
        }

        var encoding = ReadEncodingDictionary(encodingDictionary, reader, isLenientParsing);

        return encoding;
    }

    /// <summary>
    /// Builds an encoding from an encoding dictionary: the base encoding (or the standard
    /// encoding when none is named) with any differences applied on top.
    /// </summary>
    private Encoding ReadEncodingDictionary(PdfDictionary encodingDictionary, IRandomAccessRead reader, bool isLenientParsing)
    {
        Encoding baseEncoding;
        if (encodingDictionary.TryGetName(CosName.BASE_ENCODING, out var baseEncodingName))
        {
            if (!Encoding.TryGetNamedEncoding(baseEncodingName, out baseEncoding))
            {
                throw new InvalidFontFormatException($"No encoding found with name {baseEncodingName} to use as base encoding.");
            }
        }
        else
        {
            // TODO: This isn't true for non-symbolic fonts or latin fonts (based on OS?) see section 5.5.5
            baseEncoding = StandardEncoding.Instance;
        }

        // Without differences the base encoding is used as-is.
        if (!encodingDictionary.TryGetValue(CosName.DIFFERENCES, out var differencesBase))
        {
            return baseEncoding;
        }

        var differenceArray = differencesBase as COSArray;
        if (differenceArray == null)
        {
            if (differencesBase is CosObject differencesObj)
            {
                differenceArray = DirectObjectFinder.Find<COSArray>(differencesObj, pdfObjectParser, reader, isLenientParsing);
            }
            else
            {
                throw new InvalidFontFormatException($"Differences was not an array: {differencesBase}.");
            }
        }

        var differences = ProcessDifferences(differenceArray);

        var newEncoding = new DifferenceBasedEncoding(baseEncoding, differences);

        return newEncoding;
    }

    /// <summary>
    /// Converts a differences array into (code, glyph name) pairs. A number sets the current
    /// code; each following name is assigned the current code, which then increases by one.
    /// </summary>
    /// <exception cref="InvalidFontFormatException">
    /// A name appears before any code, or an entry is neither a number nor a name.
    /// </exception>
    private static IReadOnlyList<(int, string)> ProcessDifferences(COSArray differenceArray)
    {
        var differences = new List<(int, string)>();

        // No code is known until the first number is read. Iterating from the first entry
        // (rather than assuming it is a number) means an empty array yields no differences
        // and a leading name is reported instead of being silently dropped.
        int? activeCode = null;

        for (var i = 0; i < differenceArray.Count; i++)
        {
            var entry = differenceArray.get(i);

            if (entry is ICosNumber numeric)
            {
                activeCode = numeric.AsInt();
            }
            else if (entry is CosName name)
            {
                if (!activeCode.HasValue)
                {
                    throw new InvalidFontFormatException($"The differences array did not start with a character code: {differenceArray}.");
                }

                differences.Add((activeCode.Value, name.Name));
                activeCode++;
            }
            else
            {
                throw new InvalidFontFormatException($"Unexpected entry in the differences array: {differenceArray}.");
            }
        }

        return differences;
    }

    /// <summary>
    /// Finds the encoding with the given name. If the name is not recognised and the
    /// descriptor marks the font as symbolic the standard encoding is used instead.
    /// </summary>
    /// <exception cref="InvalidFontFormatException">
    /// The name is not recognised and the font is not flagged as symbolic.
    /// </exception>
    private static Encoding GetNamedEncoding(FontDescriptor descriptor, CosName encodingName)
    {
        if (Encoding.TryGetNamedEncoding(encodingName, out var encoding))
        {
            return encoding;
        }

        // Symbolic fonts default to standard encoding. This must be checked after the lookup:
        // assigning the default first is pointless because the out parameter overwrites it.
        if (descriptor?.Flags.HasFlag(FontFlags.Symbolic) == true)
        {
            return StandardEncoding.Instance;
        }

        // TODO: PDFBox would not throw here.
        throw new InvalidFontFormatException($"Unrecognised encoding name: {encodingName}");
    }
}
}
......@@ -22,17 +22,20 @@
private readonly CMapCache cMapCache;
private readonly FontDescriptorFactory fontDescriptorFactory;
private readonly TrueTypeFontParser trueTypeFontParser;
private readonly IEncodingReader encodingReader;
public TrueTypeFontHandler(IPdfObjectParser pdfObjectParser, IFilterProvider filterProvider,
CMapCache cMapCache,
FontDescriptorFactory fontDescriptorFactory,
TrueTypeFontParser trueTypeFontParser)
TrueTypeFontParser trueTypeFontParser,
IEncodingReader encodingReader)
{
this.pdfObjectParser = pdfObjectParser;
this.filterProvider = filterProvider;
this.cMapCache = cMapCache;
this.fontDescriptorFactory = fontDescriptorFactory;
this.trueTypeFontParser = trueTypeFontParser;
this.encodingReader = encodingReader;
}
public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
......@@ -62,32 +65,7 @@
}
}
Encoding encoding = null;
if (dictionary.TryGetValue(CosName.ENCODING, out var encodingBase))
{
// Symbolic fonts default to standard encoding.
if (descriptor.Flags.HasFlag(FontFlags.Symbolic))
{
encoding = StandardEncoding.Instance;
}
if (encodingBase is CosName encodingName)
{
if (!Encoding.TryGetNamedEncoding(encodingName, out encoding))
{
// TODO: PDFBox would not throw here.
throw new InvalidFontFormatException($"Unrecognised encoding name: {encodingName}");
}
}
else if (encodingBase is CosDictionary encodingDictionary)
{
throw new NotImplementedException("No support for reading encoding from dictionary yet.");
}
else
{
throw new NotImplementedException("No support for reading encoding from font yet.");
}
}
Encoding encoding = encodingReader.Read(dictionary, reader, isLenientParsing, descriptor);
return new TrueTypeSimpleFont(name, firstCharacter, lastCharacter, widths, descriptor, toUnicodeCMap, encoding);
}
......
......@@ -18,13 +18,16 @@
private readonly CMapCache cMapCache;
private readonly IFilterProvider filterProvider;
private readonly FontDescriptorFactory fontDescriptorFactory;
private readonly IEncodingReader encodingReader;
public Type1FontHandler(IPdfObjectParser pdfObjectParser, CMapCache cMapCache, IFilterProvider filterProvider, FontDescriptorFactory fontDescriptorFactory)
public Type1FontHandler(IPdfObjectParser pdfObjectParser, CMapCache cMapCache, IFilterProvider filterProvider,
FontDescriptorFactory fontDescriptorFactory, IEncodingReader encodingReader)
{
this.pdfObjectParser = pdfObjectParser;
this.cMapCache = cMapCache;
this.filterProvider = filterProvider;
this.fontDescriptorFactory = fontDescriptorFactory;
this.encodingReader = encodingReader;
}
public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
......@@ -66,32 +69,7 @@
}
}
Encoding encoding = null;
if (dictionary.TryGetValue(CosName.ENCODING, out var encodingBase))
{
// Symbolic fonts default to standard encoding.
if (descriptor.Flags.HasFlag(FontFlags.Symbolic))
{
encoding = StandardEncoding.Instance;
}
if (encodingBase is CosName encodingName)
{
if (!Encoding.TryGetNamedEncoding(encodingName, out encoding))
{
// TODO: PDFBox would not throw here.
throw new InvalidFontFormatException($"Unrecognised encoding name: {encodingName}");
}
}
else if (encodingBase is CosDictionary)
{
throw new NotImplementedException("No support for reading encoding from dictionary yet.");
}
else
{
throw new NotImplementedException("No support for reading encoding from font yet.");
}
}
Encoding encoding = encodingReader.Read(dictionary, reader, isLenientParsing, descriptor);
return new TrueTypeSimpleFont(name, firstCharacter, lastCharacter, widths, descriptor, toUnicodeCMap, encoding);
}
......
namespace UglyToad.Pdf.Fonts.Parser.Handlers
{
using System;
using Cmap;
using ContentStream;
using Core;
using Cos;
using Encodings;
using Exceptions;
using Filters;
using Geometry;
using IO;
using Pdf.Parser;
using Pdf.Parser.Parts;
using Simple;
/// <summary>
/// Creates a <see cref="Type3Font"/> from a font dictionary by reading its bounding box,
/// font matrix, character range, widths, encoding and optional ToUnicode CMap.
/// </summary>
internal class Type3FontHandler : IFontHandler
{
    private const string FontMatrixDescription = "font matrix";
    private const string BoundingBoxDescription = "font bounding box";

    private readonly IPdfObjectParser pdfObjectParser;
    private readonly CMapCache cMapCache;
    private readonly IFilterProvider filterProvider;
    private readonly IEncodingReader encodingReader;

    public Type3FontHandler(IPdfObjectParser pdfObjectParser, CMapCache cMapCache, IFilterProvider filterProvider, IEncodingReader encodingReader)
    {
        this.pdfObjectParser = pdfObjectParser;
        this.cMapCache = cMapCache;
        this.filterProvider = filterProvider;
        this.encodingReader = encodingReader;
    }

    /// <summary>
    /// Builds the font from its dictionary.
    /// </summary>
    /// <param name="dictionary">The Type 3 font dictionary.</param>
    /// <param name="reader">Used to resolve indirect references.</param>
    /// <param name="isLenientParsing">Passed through to the parsers used to resolve objects.</param>
    /// <exception cref="InvalidFontFormatException">
    /// The font matrix or bounding box entry is missing, or one of their arrays is malformed.
    /// </exception>
    public IFont Generate(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
    {
        var boundingBox = GetBoundingBox(dictionary, reader, isLenientParsing);

        var fontMatrix = GetFontMatrix(dictionary, reader, isLenientParsing);

        var firstCharacter = FontDictionaryAccessHelper.GetFirstCharacter(dictionary);
        var lastCharacter = FontDictionaryAccessHelper.GetLastCharacter(dictionary);
        var widths = FontDictionaryAccessHelper.GetWidths(pdfObjectParser, dictionary, reader, isLenientParsing);

        Encoding encoding = encodingReader.Read(dictionary, reader, isLenientParsing);

        // The ToUnicode CMap is optional; it stays null when absent or when it cannot be decoded.
        CMap toUnicodeCMap = null;
        if (dictionary.TryGetItemOfType(CosName.TO_UNICODE, out CosObject toUnicodeObj))
        {
            var toUnicode = pdfObjectParser.Parse(toUnicodeObj.ToIndirectReference(), reader, isLenientParsing) as PdfRawStream;

            var decodedUnicodeCMap = toUnicode?.Decode(filterProvider);

            if (decodedUnicodeCMap != null)
            {
                toUnicodeCMap = cMapCache.Parse(new ByteArrayInputBytes(decodedUnicodeCMap), isLenientParsing);
            }
        }

        return new Type3Font(CosName.UNCHANGED, boundingBox, fontMatrix, encoding, firstCharacter,
            lastCharacter, widths, toUnicodeCMap);
    }

    /// <summary>
    /// Reads the six-entry font matrix, given directly or as a reference to an array.
    /// </summary>
    private TransformationMatrix GetFontMatrix(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
    {
        if (!dictionary.TryGetValue(CosName.FONT_MATRIX, out var matrixObject))
        {
            throw new InvalidFontFormatException($"No font matrix found: {dictionary}.");
        }

        COSArray matrixArray;
        if (matrixObject is COSArray arr)
        {
            matrixArray = arr;
        }
        else if (matrixObject is CosObject obj)
        {
            matrixArray = DirectObjectFinder.Find<COSArray>(obj, pdfObjectParser, reader, isLenientParsing);
        }
        else
        {
            throw new InvalidFontFormatException($"The font matrix object was not an array or reference to an array: {matrixObject}.");
        }

        return TransformationMatrix.FromValues(
            GetDecimal(matrixArray, 0, FontMatrixDescription), GetDecimal(matrixArray, 1, FontMatrixDescription),
            GetDecimal(matrixArray, 2, FontMatrixDescription), GetDecimal(matrixArray, 3, FontMatrixDescription),
            GetDecimal(matrixArray, 4, FontMatrixDescription), GetDecimal(matrixArray, 5, FontMatrixDescription));
    }

    /// <summary>
    /// Reads the four-entry font bounding box, given directly or as a reference to an array.
    /// Any other kind of value produces an empty rectangle.
    /// </summary>
    private PdfRectangle GetBoundingBox(PdfDictionary dictionary, IRandomAccessRead reader, bool isLenientParsing)
    {
        if (!dictionary.TryGetValue(CosName.FONT_BBOX, out var bboxObject))
        {
            throw new InvalidFontFormatException($"Type 3 font was invalid. No Font Bounding Box: {dictionary}.");
        }

        COSArray bboxArray;
        if (bboxObject is COSArray arr)
        {
            bboxArray = arr;
        }
        else if (bboxObject is CosObject obj)
        {
            // Resolve indirect references in the same way as for the font matrix.
            bboxArray = DirectObjectFinder.Find<COSArray>(obj, pdfObjectParser, reader, isLenientParsing);
        }
        else
        {
            return new PdfRectangle(0, 0, 0, 0);
        }

        return new PdfRectangle(
            GetDecimal(bboxArray, 0, BoundingBoxDescription), GetDecimal(bboxArray, 1, BoundingBoxDescription),
            GetDecimal(bboxArray, 2, BoundingBoxDescription), GetDecimal(bboxArray, 3, BoundingBoxDescription));
    }

    /// <summary>
    /// Gets the number at the given position of the array.
    /// </summary>
    /// <param name="array">The array to read from.</param>
    /// <param name="index">The zero based position to read.</param>
    /// <param name="description">What the array represents; only used in the "not enough entries" error message.</param>
    /// <exception cref="InvalidFontFormatException">The array is too short or the entry is not a number.</exception>
    private static decimal GetDecimal(COSArray array, int index, string description)
    {
        if (index >= array.Count)
        {
            throw new InvalidFontFormatException($"The array did not contain enough entries to be the {description}: {array}.");
        }

        var item = array.get(index) as ICosNumber;

        if (item == null)
        {
            throw new InvalidFontFormatException($"The array did not contain a decimal at position {index}: {array}.");
        }

        return item.AsDecimal();
    }
}
}
namespace UglyToad.Pdf.Fonts.Parser
{
using ContentStream;
using Encodings;
using IO;
/// <summary>
/// Reads the encoding of a font from its font dictionary.
/// </summary>
internal interface IEncodingReader
{
/// <summary>
/// Reads the encoding described by the font dictionary.
/// </summary>
/// <param name="fontDictionary">The dictionary of the font whose encoding should be read.</param>
/// <param name="reader">Source used by implementations to resolve indirect objects.</param>
/// <param name="isLenientParsing">Whether lenient parsing is enabled; passed on to object resolution.</param>
/// <param name="descriptor">Optional font descriptor; may be omitted (defaults to null).</param>
/// <returns>
/// The encoding. NOTE(review): the EncodingReader implementation returns null when the
/// dictionary has no Encoding entry, so callers should be prepared for a null result.
/// </returns>
Encoding Read(PdfDictionary fontDictionary, IRandomAccessRead reader, bool isLenientParsing, FontDescriptor descriptor = null);
}
}
\ No newline at end of file
namespace UglyToad.Pdf.Fonts.Simple
{
using Cmap;
using Composite;
using Core;
using Cos;
using Encodings;
using Exceptions;
using Geometry;
using IO;
/// <summary>
/// A Type 3 font: a simple font with single byte character codes whose widths are
/// given in the font's own space and converted using the font matrix.
/// </summary>
internal class Type3Font : IFont
{
    private readonly PdfRectangle boundingBox;
    private readonly TransformationMatrix fontMatrix;
    private readonly Encoding encoding;
    private readonly int firstChar;
    private readonly int lastChar;
    private readonly decimal[] widths;
    private readonly ToUnicodeCMap toUnicodeCMap;

    public CosName Name { get; }

    public bool IsVertical { get; } = false;

    /// <summary>
    /// Creates the font.
    /// </summary>
    /// <param name="name">The name of the font.</param>
    /// <param name="boundingBox">The font bounding box.</param>
    /// <param name="fontMatrix">The matrix used to transform widths in <see cref="GetDisplacement"/>.</param>
    /// <param name="encoding">Maps character codes to glyph names; may be null if the font declared none.</param>
    /// <param name="firstChar">The character code corresponding to the first entry in <paramref name="widths"/>.</param>
    /// <param name="lastChar">The last character code covered by <paramref name="widths"/>.</param>
    /// <param name="widths">The widths for the codes from <paramref name="firstChar"/> to <paramref name="lastChar"/>.</param>
    /// <param name="toUnicodeCMap">The ToUnicode CMap, wrapped in a <see cref="ToUnicodeCMap"/>.</param>
    public Type3Font(CosName name, PdfRectangle boundingBox, TransformationMatrix fontMatrix,
        Encoding encoding, int firstChar, int lastChar, decimal[] widths,
        CMap toUnicodeCMap)
    {
        Name = name;

        this.boundingBox = boundingBox;
        this.fontMatrix = fontMatrix;
        this.encoding = encoding;
        this.firstChar = firstChar;
        this.lastChar = lastChar;
        this.widths = widths;
        this.toUnicodeCMap = new ToUnicodeCMap(toUnicodeCMap);
    }

    /// <summary>
    /// Reads a character code, which is always the single current byte.
    /// </summary>
    public int ReadCharacterCode(IInputBytes bytes, out int codeLength)
    {
        codeLength = 1;
        return bytes.CurrentByte;
    }

    /// <summary>
    /// Gets the Unicode value for a character code, using the ToUnicode CMap when it can map
    /// to Unicode and otherwise the glyph name from the encoding looked up in the Adobe glyph list.
    /// </summary>
    /// <returns>False if there is neither a usable ToUnicode CMap nor an encoding.</returns>
    public bool TryGetUnicode(int characterCode, out string value)
    {
        if (toUnicodeCMap.CanMapToUnicode)
        {
            return toUnicodeCMap.TryGet(characterCode, out value);
        }

        // The encoding is null when the font dictionary had no Encoding entry,
        // in which case there is nothing to derive a glyph name from.
        if (encoding == null)
        {
            value = null;
            return false;
        }

        var name = encoding.GetName(characterCode);

        // NOTE(review): the result is reported as a success even if the glyph list
        // yields no value for the name - confirm what NameToUnicode returns for unknown names.
        value = GlyphList.AdobeGlyphList.NameToUnicode(name);

        return true;
    }

    /// <summary>
    /// Gets the horizontal width of the character transformed by the font matrix.
    /// </summary>
    public PdfVector GetDisplacement(int characterCode)
    {
        return fontMatrix.Transform(new PdfVector(GetWidth(characterCode), 0));
    }

    /// <summary>
    /// Gets the width from the widths array for the character code.
    /// </summary>
    /// <exception cref="InvalidFontFormatException">
    /// The code is outside the first/last character range or the widths array has no entry for it.
    /// </exception>
    public decimal GetWidth(int characterCode)
    {
        var index = characterCode - firstChar;

        // Also guard against a widths array which is missing or shorter than the declared range
        // so a malformed font gives a font format error rather than an index error.
        if (characterCode < firstChar || characterCode > lastChar
            || widths == null || index >= widths.Length)
        {
            throw new InvalidFontFormatException($"The character code was not contained in the widths array: {characterCode}.");
        }

        return widths[index];
    }

    public TransformationMatrix GetFontMatrix()
    {
        return fontMatrix;
    }
}
}
......@@ -16,7 +16,6 @@
using IO;
using Logging;
using Parts;
using Parts.CrossReference;
using Tokenization.Scanner;
using Util;
......@@ -79,6 +78,7 @@
var fontDescriptorFactory = new FontDescriptorFactory();
var cidFontFactory = new CidFontFactory(fontDescriptorFactory, trueTypeFontParser, pdfObjectParser, filterProvider);
var encodingReader = new EncodingReader(pdfObjectParser);
var cMapCache = new CMapCache(new CMapParser());
......@@ -86,8 +86,9 @@
cMapCache,
filterProvider,
pdfObjectParser),
new TrueTypeFontHandler(pdfObjectParser, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser),
new Type1FontHandler(pdfObjectParser, cMapCache, filterProvider, fontDescriptorFactory));
new TrueTypeFontHandler(pdfObjectParser, filterProvider, cMapCache, fontDescriptorFactory, trueTypeFontParser, encodingReader),
new Type1FontHandler(pdfObjectParser, cMapCache, filterProvider, fontDescriptorFactory, encodingReader),
new Type3FontHandler(pdfObjectParser, cMapCache, filterProvider, encodingReader));
var dynamicParser = container.Get<DynamicParser>();
var resourceContainer = new ResourceContainer(pdfObjectParser, fontFactory);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册