OpenXML tag search

Not sure if the SDK is better, but this works and produces a dictionary that contains the name of each tag and the element whose value you can set to the new text:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Xml.Linq;

namespace ConsoleApplication8
{
    /// <summary>
    /// Console sample that scans the main part of an unpacked Word document
    /// ("document.xml") for &lt;!NAME!&gt; placeholders and lists the ones it finds.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Loads "document.xml" from the working directory, re-joins placeholders that
        /// were split over two runs, and prints every placeholder name together with
        /// the text element that holds it.
        /// </summary>
        /// <param name="args">Command line arguments (unused).</param>
        static void Main(string[] args)
        {
            Dictionary<string, XElement> lookupTable = new Dictionary<string, XElement>();
            // Lazy quantifier: with a greedy ".*" a text such as "<!A!> and <!B!>"
            // would produce the single bogus name "A!> and <!B".
            Regex reg = new Regex(@"\<\!(?<TagName>.*?)\!\>");

            XDocument doc = XDocument.Load("document.xml");
            XNamespace ns = doc.Root.GetNamespaceOfPrefix("w");
            if (ns == null)
            {
                Console.WriteLine("The document does not declare the 'w' namespace prefix.");
                return;
            }

            XName textName = ns + "t";
            XName proofErrName = ns + "proofErr";

            // Snapshot the candidates first: the loop below removes nodes from the tree.
            XElement[] elements = doc.Root.Descendants(textName)
                .Where(x => x.Value.StartsWith("<!", StringComparison.Ordinal))
                .ToArray();

            foreach (XElement item in elements)
            {
                // The run holding this element may have been merged away by an earlier iteration.
                if (item.Document == null)
                {
                    continue;
                }

                // Only stitch runs together when the placeholder is not already complete.
                if (!reg.IsMatch(item.Value))
                {
                    MergeWithFollowingRun(item, textName, proofErrName, reg);
                }

                Match match = reg.Match(item.Value);
                if (!match.Success)
                {
                    continue;
                }

                // Keep the first element for a repeated name instead of letting Add throw.
                string tagName = match.Groups["TagName"].Value;
                if (!lookupTable.ContainsKey(tagName))
                {
                    lookupTable.Add(tagName, item);
                }
            }

            foreach (var item in lookupTable)
            {
                Console.WriteLine("The document contains a tag {0}", item.Key);
                Console.WriteLine(item.Value.ToString());
            }
        }

        /// <summary>
        /// Joins the text of <paramref name="item"/> with the text of the run that follows
        /// its own run (optionally separated by one proofErr marker), provided the joined
        /// text contains a complete placeholder. The tree is left untouched otherwise.
        /// </summary>
        /// <param name="item">Text element holding the beginning of a placeholder.</param>
        /// <param name="textName">Qualified name of the text element (w:t).</param>
        /// <param name="proofErrName">Qualified name of the proofing marker (w:proofErr).</param>
        /// <param name="reg">Expression that recognises a complete placeholder.</param>
        static void MergeWithFollowingRun(XElement item, XName textName, XName proofErrName, Regex reg)
        {
            XElement run = item.Parent;

            XElement following = run.NextNode as XElement;
            XElement trailingMarker = null;
            if (following != null && following.Name == proofErrName)
            {
                trailingMarker = following;
                following = following.NextNode as XElement;
            }

            XElement continuation = following == null ? null : following.Element(textName);
            if (continuation == null)
            {
                return;
            }

            string combined = item.Value + continuation.Value;
            if (!reg.IsMatch(combined))
            {
                return;
            }

            // Remove the proofing markers around the split, but only if that is what the siblings are.
            XElement leadingMarker = run.PreviousNode as XElement;
            if (leadingMarker != null && leadingMarker.Name == proofErrName)
            {
                leadingMarker.Remove();
            }
            if (trailingMarker != null)
            {
                trailingMarker.Remove();
            }

            // The following run is dropped as a whole; its text now lives in item.
            following.Remove();
            item.Value = combined;
        }
    }
}

Edit:

A more complete example of the possible structure you can make:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Xml.Linq;
using System.IO.Compression; //you will have to add a reference to System.IO.Compression.FileSystem(.dll)
using System.IO;
using System.Text.RegularExpressions;

namespace ConsoleApplication28
{
    /// <summary>
    /// Unpacks a .docx file into the "temp" directory, indexes the &lt;!NAME!&gt;
    /// placeholders found in word/document.xml and lets callers replace them and
    /// write the result back to a new archive.
    /// </summary>
    public class MyWordDocument
    {
        #region fields

        // Directory the package is unpacked into and the main document part inside it.
        private const string TempDirectory = "temp";
        private static readonly string DocumentXmlPath = Path.Combine(TempDirectory, "word", "document.xml");

        // Lazy quantifier so that "<!A!> and <!B!>" yields "A" and "B" rather than one
        // bogus name spanning both placeholders.
        private static readonly Regex TagRegex = new Regex(@"\<\!(?<TagName>.*?)\!\>");

        private readonly string fileName;
        private XDocument document;
        //todo: create fields for all document xml files that can contain the placeholders

        private Dictionary<string, List<XElement>> lookUpTable;

        #endregion

        #region properties

        /// <summary>Names of all placeholders found in the document.</summary>
        public IEnumerable<string> Tags { get { return lookUpTable.Keys; } }

        #endregion

        #region construction

        /// <summary>
        /// Extracts the given .docx file and builds the placeholder lookup table.
        /// </summary>
        /// <param name="fileName">Path of the .docx file to open.</param>
        /// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null.</exception>
        public MyWordDocument(string fileName)
        {
            if (fileName == null)
            {
                // Fail before the temp directory is wiped.
                throw new ArgumentNullException("fileName");
            }

            this.fileName = fileName;
            ExtractDocument();
            CreateLookUp();
        }

        #endregion
        #region methods

        /// <summary>
        /// Replaces every occurrence of the placeholder &lt;!tagName!&gt; with <paramref name="value"/>.
        /// </summary>
        /// <param name="tagName">Placeholder name as reported by <see cref="Tags"/>.</param>
        /// <param name="value">Text to put in place of the placeholder.</param>
        /// <exception cref="ArgumentNullException"><paramref name="tagName"/> is null.</exception>
        /// <exception cref="KeyNotFoundException">The document contains no such placeholder.</exception>
        public void ReplaceTagWithValue(string tagName, string value)
        {
            if (tagName == null)
            {
                throw new ArgumentNullException("tagName");
            }

            List<XElement> occurrences;
            if (!lookUpTable.TryGetValue(tagName, out occurrences))
            {
                throw new KeyNotFoundException(
                    string.Format("The document does not contain a tag named '{0}'.", tagName));
            }

            string placeholder = "<!" + tagName + "!>";
            foreach (XElement item in occurrences)
            {
                item.Value = item.Value.Replace(placeholder, value);
            }
        }

        /// <summary>
        /// Writes the modified document part back to the temp directory and zips the
        /// directory into <paramref name="fileName"/>, replacing an existing file.
        /// </summary>
        /// <param name="fileName">Path of the .docx file to create.</param>
        public void Save(string fileName)
        {
            document.Save(DocumentXmlPath);
            //todo: save other parts of document here i.e. footer header or other stuff

            // ZipFile.CreateFromDirectory throws an IOException when the archive already exists.
            if (File.Exists(fileName))
            {
                File.Delete(fileName);
            }

            ZipFile.CreateFromDirectory(TempDirectory, fileName);
        }

        /// <summary>
        /// Loads word/document.xml and fills the lookup table with every text element
        /// that contains a complete placeholder, first re-joining placeholders that the
        /// proofing markers split over two runs.
        /// </summary>
        private void CreateLookUp()
        {
            //todo: make this work for all cases and for all files that can contain the placeholders
            //tip: open the raw document in word and replace the tags,
            //     save the file to different location and extract the xmlfiles of both versions and compare to see what you have to do
            lookUpTable = new Dictionary<string, List<XElement>>();
            document = XDocument.Load(DocumentXmlPath);

            XNamespace ns = document.Root.GetNamespaceOfPrefix("w");
            if (ns == null)
            {
                // No 'w' prefix declared on the root: nothing to index.
                return;
            }

            XName textName = ns + "t";
            XName proofErrName = ns + "proofErr";

            // Snapshot first: merging removes nodes from the tree.
            XElement[] elements = document.Root.Descendants(textName).ToArray();
            foreach (XElement item in elements)
            {
                // Skip text elements whose run was merged into an earlier one.
                if (item.Document == null)
                {
                    continue;
                }

                if (!TagRegex.IsMatch(item.Value) && NodeGotSplitUpIn2PartsDueToGrammarCheck(item))
                {
                    MergeSplitPlaceholder(item, textName, proofErrName);
                }

                foreach (Match match in TagRegex.Matches(item.Value))
                {
                    string tagName = match.Groups["TagName"].Value;

                    List<XElement> occurrences;
                    if (!lookUpTable.TryGetValue(tagName, out occurrences))
                    {
                        occurrences = new List<XElement>();
                        lookUpTable.Add(tagName, occurrences);
                    }

                    // One entry per element is enough; Replace handles repeats inside it.
                    if (!occurrences.Contains(item))
                    {
                        occurrences.Add(item);
                    }
                }
            }
        }

        /// <summary>
        /// Joins the text of <paramref name="item"/> with the text of the run following
        /// its own run (optionally separated by one proofErr marker), provided the joined
        /// text contains a complete placeholder. The tree is left untouched otherwise.
        /// </summary>
        private static void MergeSplitPlaceholder(XElement item, XName textName, XName proofErrName)
        {
            XElement run = item.Parent;

            XElement following = run.NextNode as XElement;
            XElement trailingMarker = null;
            if (following != null && following.Name == proofErrName)
            {
                trailingMarker = following;
                following = following.NextNode as XElement;
            }

            XElement continuation = following == null ? null : following.Element(textName);
            if (continuation == null)
            {
                return;
            }

            string combined = item.Value + continuation.Value;
            if (!TagRegex.IsMatch(combined))
            {
                return;
            }

            XElement leadingMarker = run.PreviousNode as XElement;
            if (leadingMarker != null && leadingMarker.Name == proofErrName)
            {
                leadingMarker.Remove();
            }
            if (trailingMarker != null)
            {
                trailingMarker.Remove();
            }

            // The following run is dropped as a whole; its text now lives in item.
            following.Remove();
            item.Value = combined;
        }

        /// <summary>
        /// Tells whether <paramref name="node"/> starts a placeholder and its run is
        /// directly preceded by a proofErr marker.
        /// </summary>
        /// <param name="node">Text element that is still attached to the document.</param>
        private bool NodeGotSplitUpIn2PartsDueToGrammarCheck(XElement node)
        {
            XNamespace ns = node.Document.Root.GetNamespaceOfPrefix("w");
            // "as" instead of a hard cast: the run may be the first node or follow a non-element node.
            XElement previous = node.Parent == null ? null : node.Parent.PreviousNode as XElement;

            return ns != null
                && previous != null
                && previous.Name == ns + "proofErr"
                && node.Value.StartsWith("<!", StringComparison.Ordinal);
        }


        /// <summary>
        /// Recreates the temp directory from scratch and unpacks the .docx into it.
        /// </summary>
        private void ExtractDocument()
        {
            if (Directory.Exists(TempDirectory))
            {
                Directory.Delete(TempDirectory, true);
            }

            Directory.CreateDirectory(TempDirectory);
            ZipFile.ExtractToDirectory(fileName, TempDirectory);
        }

        #endregion
    }
}

and use it like this:

/// <summary>
/// Usage sample: replaces every placeholder of a document with the same text.
/// </summary>
class Program
{
    /// <summary>
    /// Opens "somedoc.docx", substitutes "Example" for each placeholder and saves
    /// the result as "output.docx".
    /// </summary>
    /// <param name="args">Command line arguments (unused).</param>
    static void Main(string[] args)
    {
        var wordDocument = new MyWordDocument("somedoc.docx"); //todo: fix path

        // Each entry is the name extracted from a placeholder.
        foreach (var placeholderName in wordDocument.Tags)
        {
            wordDocument.ReplaceTagWithValue(placeholderName, "Example");
        }

        wordDocument.Save("output.docx"); //todo: fix path
    }
}

The problem with trying to find tags is that words are not always in the underlying XML in the format that they appear to be in Word. For example, in your sample XML the <!TAG1!> tag is split across multiple runs like this:

<w:r>
    <w:rPr>
        <w:lang w:val="en-GB"/>
    </w:rPr>
    <w:t>&lt;!TAG1</w:t>
</w:r>
<w:proofErr w:type="gramEnd"/>
    <w:r>
    <w:rPr>
        <w:lang w:val="en-GB"/>
    </w:rPr>
    <w:t>!&gt;</w:t>
</w:r>

As pointed out in the comments this is sometimes caused by the spelling and grammar checker but that's not all that can cause it. Having different styles on parts of the tag could also cause it for example.

One way of handling this is to find the InnerText of a Paragraph and compare that against your Regex. The InnerText property will return the plain text of the paragraph without any formatting or other XML within the underlying document getting in the way.

Once you have your tags, replacing the text is the next problem. For the reasons above you can't just replace the InnerText with some new text, as it wouldn't be clear which parts of the text belong in which Run. The easiest way around this is to remove any existing Runs and add a new Run with a Text property containing the new text.

The following code shows finding the tags and replacing them immediately rather than using two passes as you suggest in your question. This was just to make the example simpler to be honest. It should show everything you need.

/// <summary>
/// Opens the given Word document for editing and runs the placeholder replacement
/// over its headers, its main document and its footers, in that order.
/// </summary>
/// <param name="filename">Path of the document to modify in place.</param>
private static void ReplaceTags(string filename)
{
    var placeholderPattern = new Regex("<!(.)*?!>", RegexOptions.Compiled);

    using (var package = WordprocessingDocument.Open(filename, true))
    {
        var mainPart = package.MainDocumentPart;

        // headers come first
        foreach (var header in mainPart.HeaderParts)
        {
            ReplaceParagraphParts(header.Header, placeholderPattern);
        }

        // then the document itself
        ReplaceParagraphParts(mainPart.Document, placeholderPattern);

        // and finally the footers
        foreach (var footer in mainPart.FooterParts)
        {
            ReplaceParagraphParts(footer.Footer, placeholderPattern);
        }
    }
}

/// <summary>
/// Replaces every placeholder matched by <paramref name="regex"/> in each paragraph
/// below <paramref name="element"/>. A paragraph that contains a placeholder has all
/// of its direct child runs replaced by one new run holding the substituted text.
/// </summary>
/// <param name="element">Root element whose descendant paragraphs are processed.</param>
/// <param name="regex">Expression that recognises a placeholder.</param>
private static void ReplaceParagraphParts(OpenXmlElement element, Regex regex)
{
    foreach (var paragraph in element.Descendants<Paragraph>())
    {
        // Read the text before the runs are removed; afterwards InnerText would be empty.
        string paragraphText = paragraph.InnerText;
        if (!regex.IsMatch(paragraphText))
        {
            continue;
        }

        // Regex.Replace substitutes every placeholder in the paragraph. Replacing only
        // the literal text of the first match would leave a second, different
        // placeholder untouched while its formatting is still flattened below.
        Run newRun = new Run();
        newRun.AppendChild(new Text(regex.Replace(paragraphText, "some new value")));

        //remove any child runs
        paragraph.RemoveAllChildren<Run>();
        //add the newly created run
        paragraph.AppendChild(newRun);
    }
}

One downside of the above approach is that any styles you may have had will be lost. These could be copied from the existing Runs, but if there are multiple Runs with differing properties you'll need to work out which ones you need to copy where. There's nothing to stop you creating multiple Runs in the above code, each with different properties, if that's what is required. Other elements would also be lost (e.g. any symbols), so those would need to be accounted for too.

I have the same need as you do with the exception that I want to use ${...} entries instead of <!...!>. You could customize the code below to use your tags but it would require more states.

The following code works for XML as well as OpenXML nodes. I tested the code using XML, because when it comes to Word documents it is hard to control how Word arranges the paragraphs, runs and text elements. I guess it is not impossible, but this way I have more control:

/// <summary>
/// Runs the placeholder replacement over a table of XML samples and prints, for each
/// sample, the input, the actual result, the expected result and whether they agree.
/// </summary>
/// <param name="args">Command line arguments (unused).</param>
static void Main(string[] args)
{
  //FillInValues(FileName("test01.docx"), FileName("test01_out.docx"));

  // Column 0: input XML, column 1: expected output.
  string[,] cases =
  {
    { "<r><t>${abc</t><t>}$</t><t>{tha}</t></r>", "<r><t>ABC</t><t>THA</t><t></t></r>" },
    { "<r><t>$</t><t>{</t><t>abc</t><t>}</t></r>", "<r><t>ABC</t><t></t></r>" },
    { "<r><t>${abc}</t></r>", "<r><t>ABC</t></r>" },
    { "<r><t>x${abc}</t></r>", "<r><t>xABC</t></r>" },
    { "<r><t>x${abc}y</t></r>", "<r><t>xABCy</t></r>" },
    { "<r><t>x${abc}${tha}z</t></r>", "<r><t>xABCTHAz</t></r>" },
    { "<r><t>x${abc}u${tha}z</t></r>", "<r><t>xABCuTHAz</t></r>" },
    { "<r><t>x${ab</t><t>c}u</t></r>", "<r><t>xABC</t><t>u</t></r>" },
    { "<r><t>x${ab</t><t>yupeekaiiei</t><t>c}u</t></r>", "<r><t>xABYUPEEKAIIEIC</t><t>u</t></r>" },
    { "<r><t>x${ab</t><t>yupeekaiiei</t><t>}</t></r>", "<r><t>xABYUPEEKAIIEI</t><t></t></r>" },
  };

  int caseCount = cases.GetLength(0);
  int row = 0;
  while (row < caseCount)
  {
    var input = cases[row, 0];
    var expected = cases[row, 1];
    var actual = Test(input);
    Console.WriteLine($"{input} => {actual} == {expected} = {actual == expected}");
    row++;
  }

  Console.WriteLine("Done!");
  Console.ReadLine();
}


/// <summary>
/// Supplies the replacement text for a placeholder.
/// </summary>
public interface ITextReplacer
{
  /// <summary>
  /// Returns the text that should be written in place of a placeholder.
  /// </summary>
  /// <param name="value">
  /// The placeholder content; SimpleStateMachine passes the characters it buffered
  /// between "${" and "}".
  /// </param>
  /// <returns>The replacement text.</returns>
  string ReplaceValue(string value);
}

/// <summary>
/// Default replacer: returns the placeholder content in upper case.
/// </summary>
public class DefaultTextReplacer : ITextReplacer
{
  /// <summary>
  /// Upper-cases <paramref name="value"/>.
  /// </summary>
  /// <param name="value">The placeholder content.</param>
  /// <returns>The upper-cased content.</returns>
  public string ReplaceValue(string value)
  {
    // Invariant casing keeps the result independent of the current thread culture
    // (ToUpper() maps "i" differently under e.g. a Turkish culture).
    return value.ToUpperInvariant();
  }
}

/// <summary>
/// Abstraction over one text-carrying node, so the same scanning code can work on
/// different node types (implemented in this file by XElementWrapper and
/// OpenXmlTextWrapper).
/// </summary>
public interface ITextElement
{
  /// <summary>Gets or sets the text of the node.</summary>
  string Value { get; set; }

  /// <summary>
  /// Detaches the node from its parent; the implementations in this file do so by
  /// calling Remove() on the wrapped node.
  /// </summary>
  void RemoveFromParent();
}


/// <summary>
/// Exposes an <see cref="XElement"/> through <see cref="ITextElement"/>.
/// </summary>
public class XElementWrapper : ITextElement
{
  private XElement _wrapped;

  /// <summary>Wraps the given element.</summary>
  /// <param name="element">The element to wrap.</param>
  public XElementWrapper(XElement element)
  {
    _wrapped = element;
  }

  /// <summary>Reads or writes the value of the wrapped element.</summary>
  string ITextElement.Value
  {
    get { return _wrapped.Value; }
    set { _wrapped.Value = value; }
  }

  /// <summary>The wrapped element.</summary>
  public XElement Element
  {
    get { return _wrapped; }
    set { _wrapped = value; }
  }

  /// <summary>Calls Remove() on the wrapped element.</summary>
  public void RemoveFromParent() => _wrapped.Remove();
}

/// <summary>
/// Exposes an OpenXML Text element through <see cref="ITextElement"/>.
/// </summary>
public class OpenXmlTextWrapper : ITextElement
{
  private Text _node;

  /// <summary>Wraps the given text element.</summary>
  /// <param name="text">The text element to wrap.</param>
  public OpenXmlTextWrapper(Text text)
  {
    _node = text;
  }

  /// <summary>Reads or writes the Text property of the wrapped element.</summary>
  public string Value
  {
    get { return _node.Text; }
    set { _node.Text = value; }
  }

  /// <summary>The wrapped text element.</summary>
  public Text Text
  {
    get { return _node; }
    set { _node = value; }
  }

  /// <summary>Calls Remove() on the wrapped element.</summary>
  public void RemoveFromParent() => _node.Remove();
}


/// <summary>
/// Copies the source document to the destination (overwriting it) and replaces the
/// placeholders in every paragraph below the body of the copy.
/// </summary>
/// <param name="sourceFileName">Path of the template document.</param>
/// <param name="destFileName">Path of the document to create and fill in.</param>
private static void FillInValues(string sourceFileName, string destFileName)
{
  File.Copy(sourceFileName, destFileName, true);

  using (var package = WordprocessingDocument.Open(destFileName, true))
  {
    var documentBody = package.MainDocumentPart.Document.Body;
    var allParagraphs = documentBody.Descendants<Paragraph>();

    var machine = new SimpleStateMachine();

    //machine.TextReplacer = <your implementation object >
    ProcessParagraphs(allParagraphs, machine);
  }
}

/// <summary>
/// Feeds the Text elements of every direct child run of each paragraph to the
/// state machine, in document order.
/// </summary>
/// <param name="paras">Paragraphs to scan.</param>
/// <param name="stateMachine">State machine that detects and replaces the placeholders.</param>
private static void ProcessParagraphs(IEnumerable<Paragraph> paras, SimpleStateMachine stateMachine)
{
  foreach (var para in paras)
  {
    // Start every paragraph from a clean state. Otherwise an unterminated "${" in one
    // paragraph stays open, and the next "}" anywhere later makes the state machine
    // remove every Text element in between.
    stateMachine.Reset();

    foreach (var run in para.Elements<Run>())
    {
      // Snapshot the texts: HandleText may detach Text elements while we iterate.
      foreach (var text in run.Elements<Text>().ToArray())
      {
        stateMachine.HandleText(new OpenXmlTextWrapper(text));
      }
    }
  }
}

/// <summary>
/// Character-level scanner that finds ${...} placeholders, possibly spread over
/// several consecutive text elements, and replaces them through <see cref="TextReplacer"/>.
/// </summary>
public class SimpleStateMachine
{
  // 0 - outside - initial state
  // 1 - $ matched
  // 2 - ${ matched
  // 3 - } - final state (handled inline; State never actually holds 3)

  // 0 -> 1 $
  // 0 -> 0 anything other than $
  // 1 -> 2 {
  // 1 -> 1 $ (a repeated $ restarts the candidate at the new position)
  // 1 -> 0 anything other than { or $
  // 2 -> 3 }
  // 2 -> 2 anything other than }
  // 3 -> 0

  /// <summary>Produces the replacement for the content of a placeholder.</summary>
  public ITextReplacer TextReplacer { get; set; } = new DefaultTextReplacer();

  /// <summary>Current state, see the table above.</summary>
  public int State { get; set; } = 0;

  /// <summary>Text elements touched by the placeholder candidate being scanned.</summary>
  public List<ITextElement> TextsList { get; } = new List<ITextElement>();

  /// <summary>Characters collected between "${" and "}".</summary>
  public StringBuilder Buffer { get; } = new StringBuilder();


  /// <summary>
  /// The index inside the Text element where the $ is found
  /// </summary>
  public int Position { get; set; }

  /// <summary>Returns to the initial state and forgets the current candidate.</summary>
  public void Reset()
  {
    State = 0;
    TextsList.Clear();
    Buffer.Clear();
  }

  /// <summary>
  /// Records <paramref name="text"/> as part of the current candidate unless it is
  /// already the most recently recorded element.
  /// </summary>
  /// <param name="text">The text element being scanned.</param>
  public void Add(ITextElement text)
  {
    if (TextsList.Count == 0 || !ReferenceEquals(TextsList[TextsList.Count - 1], text))
    {
      TextsList.Add(text);
    }
  }

  /// <summary>
  /// Scans the characters of <paramref name="text"/>, continuing from the state left
  /// by the previous call, and replaces every placeholder that is completed in it.
  /// </summary>
  /// <param name="text">The next text element in document order.</param>
  /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
  public void HandleText(ITextElement text)
  {
    if (text == null)
    {
      throw new ArgumentNullException(nameof(text));
    }

    // text.Value is re-read on every iteration because a replacement rewrites it.
    for (int i = 0; i < text.Value.Length; i++)
    {
      char c = text.Value[i];

      switch (State)
      {
        case 0:
          if (c == '$')
          {
            BeginCandidate(text, i);
          }
          break;
        case 1:
          if (c == '{')
          {
            State = 2;
            Add(text);
          }
          else if (c == '$')
          {
            // "$${name}": the first $ is literal text and this one may open a placeholder.
            // A plain reset would swallow it and the placeholder would be missed.
            Reset();
            BeginCandidate(text, i);
          }
          else
          {
            Reset();
          }
          break;
        case 2:
          if (c == '}')
          {
            Add(text);
            i = ReplaceCurrentPlaceholder(text, i);
          }
          else
          {
            Buffer.Append(c);
            Add(text);
          }
          break;
      }
    }
  }

  // Enters state 1 with the $ found at the given index of the given text.
  private void BeginCandidate(ITextElement text, int index)
  {
    State = 1;
    Position = index;
    Add(text);
  }

  // Writes the replacement for the buffered placeholder, resets the state machine and
  // returns the loop index from which HandleText continues (the loop increments it next).
  private int ReplaceCurrentPlaceholder(ITextElement text, int closingIndex)
  {
    Console.WriteLine("Found: " + Buffer);

    string replacement = TextReplacer.ReplaceValue(Buffer.ToString());
    ITextElement first = TextsList[0];
    string head = first.Value.Substring(0, Position) + replacement;
    int resumeAt;

    if (TextsList.Count == 1)
    {
      // Happy path - the whole placeholder sits in one element: splice the replacement
      // in and continue scanning right after it.
      first.Value = head + first.Value.Substring(closingIndex + 1);
      resumeAt = head.Length - 1;
    }
    else
    {
      // The placeholder spans several elements - discard the inbetweeners.
      for (int j = 1; j < TextsList.Count - 1; j++)
      {
        TextsList[j].RemoveFromParent();
      }

      // The replacement goes into the element where the $ was found ...
      first.Value = head;
      // ... and the current element keeps only what follows the closing brace.
      text.Value = text.Value.Substring(closingIndex + 1);
      resumeAt = -1;
    }

    Reset();
    return resumeAt;
  }
}

/// <summary>
/// Parses <paramref name="xml"/>, runs the state machine over every element that has
/// no child elements, and returns the resulting XML without formatting.
/// </summary>
/// <param name="xml">XML fragment to process.</param>
/// <returns>The processed XML as a single unformatted string.</returns>
public static string Test(string xml)
{
  var parsed = XElement.Parse(xml);
  var machine = new SimpleStateMachine();

  // Leaf elements only: those are the ones that carry the text.
  var leaves = parsed.Descendants().Where(node => !node.Elements().Any());
  foreach (var leaf in leaves)
  {
    machine.HandleText(new XElementWrapper(leaf));
  }

  return parsed.ToString(SaveOptions.DisableFormatting);
}

I know my answer is late but it might be of use to others. Also make sure you test it. I am going to do more testing tomorrow with real documents. If I find any bugs I will fix the code here, but so far so good.

Update: the code doesn't work when the ${...} placeholders are placed in a table. This is a problem with the code that scans the document (the FillInValues function).

Update: I changed the code to scan all paragraphs.

OpenXML tag search

Tags:

C#

.Net

Ms Word

Openxml

Related

Recent Posts