Search PDF content in Sitecore

To people who have not tried to do this themselves, this seems like an easy task. All we need to do is get all the text content and load it into the search index. Initially I thought I had a good solution with PdfSharp, using code that I found in this Stack Overflow post. It seemed to be working fine until I attempted to run my site on Azure. It apparently uses lower-level, OS-based API calls that are just not available on Azure using the new Sitecore PaaS setup.

There are several paid libraries that claim to be able to accomplish just this; however, like most developers, I wasn’t about to pitch buying a license to read PDF content to my clients. So the search continued. After many hours (which I hope to save you from here) I came across a solution that did the trick (for the most part).

Reading PDF content

This code does require PdfSharp as a dependency, get it here on nuget.

NOTE: this code was adapted from this Stack Overflow post and is not entirely my own. I don’t think the poster on Stack Overflow originated the code either. Credit is due somewhere, but I’m not quite sure where.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using PdfSharp.Pdf;
using PdfSharp.Pdf.IO;
using Sitecore.Data.Items;

namespace IHN.Feature.Component
{
	/// <summary>
	/// Extracts the words of a PDF document, either one stored as a Sitecore media
	/// item or an already opened <see cref="PdfDocument"/>, by scanning the page
	/// content streams for literal strings inside BT ... ET text objects.
	/// Adapted from code found here http://stackoverflow.com/questions/83152/reading-pdf-documents-in-net
	/// </summary>
	/// <remarks>
	/// This is a lightweight scanner, not a full PDF interpreter: only literal
	/// strings ("(...)") are read, nested parentheses are not tracked, and font
	/// encodings are not applied. Text inside images is never found.
	/// </remarks>
	public class SitecorePdfParser
	{
		/// <summary>Size of the sliding window of recently read characters used to spot operators.</summary>
		private const int NumberOfCharsToKeep = 15;

		private const string PdfMimeType = "application/pdf";

		/// <summary>Operator that opens a text object.</summary>
		private static readonly string[] BeginTextTokens = { "BT" };

		/// <summary>Operator that closes a text object.</summary>
		private static readonly string[] EndTextTokens = { "ET" };

		/// <summary>Operators after which the text collected so far is emitted as a word.</summary>
		private static readonly string[] WordBreakTokens = { "TD", "Td", "'", "T*", "\"", "Tj" };

		/// <summary>
		/// Escape sequences (as they appear in the collected text) and the text
		/// that replaces them. Applied in order by <see cref="CleanupContent"/>.
		/// </summary>
		private static readonly string[,] EscapeTable =
		{
			{ @"\(", "(" }, { @"\)", ")" }, { @"\226", "-" }, { @"\222", "'" },
			{ @"\223", "\"" }, { @"\224", "\"" }, { @"\340", "à" }, { @"\342", "â" },
			{ @"\344", "ä" }, { @"\300", "À" }, { @"\302", "Â" }, { @"\304", "Ä" },
			{ @"\351", "é" }, { @"\350", "è" }, { @"\352", "ê" }, { @"\353", "ë" },
			{ @"\311", "É" }, { @"\310", "È" }, { @"\312", "Ê" }, { @"\313", "Ë" },
			{ @"\362", "ò" }, { @"\364", "ô" }, { @"\366", "ö" }, { @"\322", "Ò" },
			{ @"\324", "Ô" }, { @"\326", "Ö" }, { @"\354", "ì" }, { @"\356", "î" },
			{ @"\357", "ï" }, { @"\314", "Ì" }, { @"\316", "Î" }, { @"\317", "Ï" },
			{ @"\347", "ç" }, { @"\307", "Ç" }, { @"\371", "ù" }, { @"\373", "û" },
			{ @"\374", "ü" }, { @"\331", "Ù" }, { @"\333", "Û" }, { @"\334", "Ü" },
			{ @"\256", "®" }, { @"\231", "™" }, { @"\253", "«" }, { @"\273", "»" },
			{ @"\251", "©" }, { @"\221", "'" }
		};

		/// <summary>The opened document, or null when the source was not a readable PDF.</summary>
		private readonly PdfDocument _doc;

		/// <summary>
		/// Creates a parser for the media item wrapped by <paramref name="item"/>.
		/// </summary>
		/// <param name="item">Sitecore item that represents a media item.</param>
		public SitecorePdfParser(Item item) : this(new MediaItem(item))
		{
		}

		/// <summary>
		/// Creates a parser for a media item. When the item is null, is not a PDF
		/// (by MIME type) or has no media stream, no document is loaded and
		/// <see cref="ExtractText"/> yields nothing.
		/// </summary>
		/// <param name="item">The media item to read.</param>
		public SitecorePdfParser(MediaItem item)
		{
			if (item == null ||
				!string.Equals(item.MimeType, PdfMimeType, StringComparison.OrdinalIgnoreCase))
			{
				return;
			}

			// The stream is only needed while the document is being opened, so it is
			// released as soon as PdfReader.Open returns instead of being leaked.
			// NOTE(review): assumes PdfSharp reads the document fully during Open
			// (its file-path overload closes its own stream the same way) - confirm
			// against the PdfSharp version in use.
			using (Stream s = item.GetMediaStream())
			{
				if (s == null)
				{
					return;
				}

				_doc = PdfReader.Open(s);
			}
		}

		/// <summary>
		/// Creates a parser for an already opened document.
		/// </summary>
		/// <param name="document">The document to read; null yields no text.</param>
		public SitecorePdfParser(PdfDocument document)
		{
			_doc = document;
		}

		/// <summary>
		/// Lazily enumerates the words found in every content stream of every page.
		/// </summary>
		/// <returns>The extracted words in document order; empty when no document is loaded.</returns>
		public IEnumerable<string> ExtractText()
		{
			if (_doc == null)
			{
				yield break;
			}

			foreach (PdfPage page in _doc.Pages)
			{
				for (int index = 0; index < page.Contents.Elements.Count; index++)
				{
					// Skip content entries that are not dictionaries or carry no stream
					// rather than failing the whole document.
					PdfDictionary content = page.Contents.Elements.GetDictionary(index);
					PdfDictionary.PdfStream stream = content == null ? null : content.Stream;
					if (stream == null)
					{
						continue;
					}

					foreach (string text in ExtractTextFromPdfBytes(stream.Value))
					{
						yield return text;
					}
				}
			}
		}

		/// <summary>
		/// Scans an uncompressed PDF content stream and yields the words found in
		/// the literal strings of its text objects.
		/// </summary>
		/// <param name="input">Uncompressed content stream bytes; null or empty yields nothing.</param>
		/// <returns>
		/// The words in stream order. A word ends at a space inside a string, at a
		/// text positioning/showing operator (Td, TD, T*, ', ", Tj), at ET, or at
		/// the end of the input.
		/// </returns>
		public IEnumerable<string> ExtractTextFromPdfBytes(byte[] input)
		{
			if (input == null || input.Length == 0)
			{
				yield break;
			}

			StringBuilder word = new StringBuilder();
			bool inTextObject = false;
			bool nextLiteral = false;
			int bracketDepth = 0;

			char[] previousCharacters = new char[NumberOfCharsToKeep];
			for (int j = 0; j < NumberOfCharsToKeep; j++)
			{
				previousCharacters[j] = ' ';
			}

			foreach (byte t in input)
			{
				char c = (char)t;

				// Operators are only recognised outside of strings. The window holds
				// the characters read before c, so a token is seen one character
				// after its trailing whitespace.
				if (inTextObject && bracketDepth == 0)
				{
					bool endOfTextObject = CheckToken(EndTextTokens, previousCharacters);
					if ((endOfTextObject || CheckToken(WordBreakTokens, previousCharacters)) &&
						word.Length > 0)
					{
						yield return TakeWord(word);
					}

					if (endOfTextObject)
					{
						// Fall through (no "continue") so that c is still recorded below;
						// otherwise a "BT" directly after "ET" would lose its first letter
						// and the whole following text object would be skipped.
						inTextObject = false;
					}
				}

				if (inTextObject)
				{
					if (c == '(' && bracketDepth == 0 && !nextLiteral)
					{
						bracketDepth = 1;
					}
					else if (c == ')' && bracketDepth == 1 && !nextLiteral)
					{
						bracketDepth = 0;
					}
					else if (bracketDepth == 1)
					{
						if (c == '\\' && !nextLiteral)
						{
							nextLiteral = true;
						}
						else
						{
							bool escaped = nextLiteral;
							nextLiteral = false;

							if (c == ' ' || (escaped && IsWhitespaceEscape(c)))
							{
								if (word.Length > 0)
								{
									yield return TakeWord(word);
								}
							}
							else if ((c >= '!' && c <= '~') || (c >= 128 && c < 255))
							{
								// Keep the backslash of an octal escape (\ddd) so that
								// CleanupContent can translate it; without it the escape
								// would be emitted as plain digits.
								// Known limitation: an escaped backslash followed by
								// digits is indistinguishable from an octal escape here.
								if (escaped && c >= '0' && c <= '7')
								{
									word.Append('\\');
								}

								word.Append(c);
							}
						}
					}
				}

				// Slide the window of recent characters and record c.
				for (int j = 0; j < NumberOfCharsToKeep - 1; j++)
				{
					previousCharacters[j] = previousCharacters[j + 1];
				}
				previousCharacters[NumberOfCharsToKeep - 1] = c;

				if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
				{
					inTextObject = true;
				}
			}

			// Emit whatever is still buffered when the stream ends without trailing
			// whitespace after the last operator.
			if (word.Length > 0)
			{
				yield return TakeWord(word);
			}
		}

		/// <summary>
		/// Returns the cleaned-up content of <paramref name="buffer"/> and empties it.
		/// </summary>
		private static string TakeWord(StringBuilder buffer)
		{
			string result = CleanupContent(buffer.ToString());
			buffer.Clear();
			return result;
		}

		/// <summary>
		/// True for the characters that, after a backslash, denote white space in a
		/// PDF literal string (\n, \r, \t, \b, \f).
		/// </summary>
		private static bool IsWhitespaceEscape(char c)
		{
			return c == 'n' || c == 'r' || c == 't' || c == 'b' || c == 'f';
		}

		/// <summary>
		/// True for the white-space characters accepted around an operator token.
		/// </summary>
		private static bool IsTokenSeparator(char c)
		{
			return c == ' ' || c == 0x0d || c == 0x0a;
		}

		/// <summary>
		/// Replaces the escape sequences listed in <see cref="EscapeTable"/> with
		/// the characters they stand for.
		/// </summary>
		/// <param name="text">A collected word, possibly containing "\ddd" escapes.</param>
		/// <returns>The word with known escapes replaced.</returns>
		private static string CleanupContent(string text)
		{
			// Every table entry starts with a backslash, so most words need no work.
			if (text.IndexOf('\\') < 0)
			{
				return text;
			}

			// The entries are literal strings; plain ordinal replacement gives the
			// same result as a regular expression without building 46 Regex objects
			// for every word.
			for (int i = 0; i < EscapeTable.GetLength(0); i++)
			{
				text = text.Replace(EscapeTable[i, 0], EscapeTable[i, 1]);
			}

			return text;
		}

		/// <summary>
		/// Checks whether one of the given operator tokens (e.g. BT) was just read:
		/// the token must be the most recent text in the window, followed by one
		/// white-space character and preceded by one.
		/// </summary>
		/// <param name="tokens">The tokens to look for; any length that fits the window.</param>
		/// <param name="recent">The window of most recently read characters, oldest first.</param>
		/// <returns>True when any token matches; otherwise false.</returns>
		private static bool CheckToken(string[] tokens, char[] recent)
		{
			int last = recent.Length - 1;
			if (!IsTokenSeparator(recent[last]))
			{
				return false;
			}

			foreach (string token in tokens)
			{
				if (string.IsNullOrEmpty(token))
				{
					continue;
				}

				// Index of the token's first character; one more character is needed
				// in front of it for the leading separator.
				int start = last - token.Length;
				if (start < 1 || !IsTokenSeparator(recent[start - 1]))
				{
					continue;
				}

				bool matches = true;
				for (int i = 0; i < token.Length; i++)
				{
					if (recent[start + i] != token[i])
					{
						matches = false;
						break;
					}
				}

				if (matches)
				{
					return true;
				}
			}

			return false;
		}
	}
}

Then we need to wire this up to the index crawler to make sure that the index uses this class to populate the search index with our Pdf content.

We need to implement a Sitecore IComputedIndexField class to accomplish this.

	/// <summary>
	/// Computed index field that returns the text of a PDF media item as one
	/// space-separated string, so the PDF can be found by a full-text search.
	/// Requires the Sitecore.ContentSearch, Sitecore.ContentSearch.ComputedFields,
	/// Sitecore.Data.Items, Sitecore.Diagnostics and System.Linq namespaces.
	/// </summary>
	public class IndexPdfContent : IComputedIndexField
	{
		/// <summary>
		/// Name of the index field this value is stored in. Required by
		/// <see cref="IComputedIndexField"/>; expected to be set from the
		/// fieldName attribute of the configuration entry.
		/// </summary>
		public string FieldName { get; set; }

		/// <summary>
		/// Return type of the field. Required by <see cref="IComputedIndexField"/>;
		/// expected to be set from the returnType attribute when one is configured.
		/// </summary>
		public string ReturnType { get; set; }

		/// <summary>
		/// Extracts the PDF text of the indexed item.
		/// </summary>
		/// <param name="indexable">The item being indexed.</param>
		/// <returns>
		/// The extracted words joined by single spaces, or null when the indexable
		/// is not a Sitecore item, no text was found, or extraction failed (the
		/// failure is logged and does not interrupt indexing).
		/// </returns>
		public object ComputeFieldValue(IIndexable indexable)
		{
			try
			{
				var sitecoreIndexable = indexable as SitecoreIndexableItem;

				// Check the wrapped item explicitly instead of relying on an implicit
				// conversion that would hand a null item to the MediaItem constructor.
				if (sitecoreIndexable == null || sitecoreIndexable.Item == null) return null;

				var pdfContent = new SitecorePdfParser(new MediaItem(sitecoreIndexable.Item)).ExtractText().ToList();

				if (pdfContent.Count == 0) return null;

				return string.Join(" ", pdfContent);
			}
			catch (Exception e)
			{
				// Best effort: a PDF that cannot be parsed must not fail the crawl.
				Log.Error("Unable to assemble PDF content for the search index ", e, this);
				return null;
			}
		}
	}

And finally wire it up to the indexer

<configuration xmlns:patch="http://www.sitecore.net/xmlconfig/">
	<sitecore>
		<contentSearch>
			<!--
				Registers the IndexPdfContent computed field in the Lucene, Solr and
				cloud index configurations. Replace [NAMESPACE] and [DLL NAME] with the
				namespace and assembly that contain the IndexPdfContent class.
			-->
			<indexConfigurations>
				<defaultLuceneIndexConfiguration>
					<documentOptions>
						<fields hint="raw:AddComputedIndexField">
							<!-- stores the extracted PDF text in the _pdfcontent index field to allow PDF search -->
							<field fieldName="_pdfcontent" type="[NAMESPACE].IndexPdfContent, [DLL NAME]" />
						</fields>
					</documentOptions>
				</defaultLuceneIndexConfiguration>
				<defaultSolrIndexConfiguration>
					<documentOptions>
						<fields hint="raw:AddComputedIndexField">
							<!-- stores the extracted PDF text in the _pdfcontent index field to allow PDF search -->
							<field fieldName="_pdfcontent" type="[NAMESPACE].IndexPdfContent, [DLL NAME]" />
						</fields>
					</documentOptions>
				</defaultSolrIndexConfiguration>
				<defaultCloudIndexConfiguration>
					<documentOptions>
						<fields hint="raw:AddComputedIndexField">
							<!-- stores the extracted PDF text in the pdf_content index field (note the different name for the cloud index) -->
							<field fieldName="pdf_content" cloudFieldName="pdf_content" type="[NAMESPACE].IndexPdfContent, [DLL NAME]" />
						</fields>
					</documentOptions>
				</defaultCloudIndexConfiguration>
			</indexConfigurations>
		</contentSearch>
	</sitecore>
</configuration>

Ending Results

Now we have our search index populated with PDF contents. So if someone wants to find a PDF with a text search, it’s as simple as querying the index on the field assigned in the XML with the user’s search text.

Disclaimer

While this solution is quite good, it’s not perfect. If you have text inside images in a PDF, it won’t find that. Additionally, I’ve noticed that in rare cases words might be broken up when they’re being extracted. Presumably this is due to PDF formatting. If you happen to figure out how to resolve this completely, let me know and I’d love to update this code.