Skip to content

Commit a8381f4

Browse files
committed
Support sections in RTF to DOCX converter
1 parent 011bfc3 commit a8381f4

1 file changed

Lines changed: 211 additions & 16 deletions

File tree

src/DocSharp.Docx/RtfToDocx/RtfToDocxConverter.cs

Lines changed: 211 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ public class RtfToDocxConverter : ITextToDocxConverter
2222
private List<(int R, int G, int B)> colorTable = new();
2323
private Encoding? codePageEncoding;
2424
private BorderType? currentBorder;
25+
private SectionProperties? defaultSectPr;
26+
private SectionProperties? currentSectPr;
2527

2628
#if !NETFRAMEWORK
2729
static RtfToDocxConverter()
@@ -57,6 +59,13 @@ static RtfToDocxConverter()
5759
/// <param name="targetDocument"></param>
5860
public void BuildDocx(TextReader input, WordprocessingDocument targetDocument)
5961
{
62+
fontTable = new();
63+
colorTable = new();
64+
codePageEncoding = null;
65+
currentBorder = null;
66+
defaultSectPr = null;
67+
currentSectPr = null;
68+
6069
if (targetDocument.MainDocumentPart == null)
6170
targetDocument.AddMainDocumentPart();
6271

@@ -67,6 +76,23 @@ public void BuildDocx(TextReader input, WordprocessingDocument targetDocument)
6776

6877
var rtfDocument = RtfReader.ReadRtf(input);
6978
ConvertGroup(rtfDocument.Root, targetDocument.MainDocumentPart.Document.Body, targetDocument.MainDocumentPart);
79+
80+
if (!targetDocument.MainDocumentPart.Document.Body.Descendants<SectionProperties>().Any())
81+
{
82+
// If the document does not contain sections, add the default section properties as last body element,
83+
// so that it's applied by default in DOCX too.
84+
// This preserves page size and other properties if they are specified as document-level settings only (\paperw, \paperh, ...)
85+
// but no section is present.
86+
if (defaultSectPr != null)
87+
targetDocument.MainDocumentPart.Document.Body.AppendChild(defaultSectPr.CloneNode(true));
88+
}
89+
else
90+
{
91+
// If at least a section was created, add the last section properties (that was not added to a paragraph)
92+
// as last body element, so that it's applied by default to new DOCX sections.
93+
if (currentSectPr != null)
94+
targetDocument.MainDocumentPart.Document.Body.AppendChild(currentSectPr.CloneNode(true));
95+
}
7096
}
7197

7298
private void ConvertGroup(RtfGroup group, OpenXmlElement parentElement, MainDocumentPart targetDocument)
@@ -103,7 +129,7 @@ private void ConvertGroup(RtfGroup group, OpenXmlElement parentElement, MainDocu
103129
var pPr = new ParagraphProperties();
104130
Paragraph? currentParagraph = null;
105131
Run? currentRun = null;
106-
ConvertGroupInner(group, parentElement, targetDocument, fmtStack, pPr, ref currentParagraph, ref currentRun);
132+
ConvertGroupInner(group, parentElement, targetDocument, fmtStack, pPr, currentParagraph, currentRun);
107133
}
108134

109135
private FormattingState TryPeek(Stack<FormattingState> stack)
@@ -119,7 +145,7 @@ private void TryPop(Stack<FormattingState> stack)
119145
stack.Pop();
120146
}
121147

122-
private void ConvertGroupInner(RtfGroup group, OpenXmlElement parentElement, MainDocumentPart targetDocument, Stack<FormattingState> fmtStack, ParagraphProperties pPr, ref Paragraph? currentParagraph, ref Run? currentRun)
148+
private void ConvertGroupInner(RtfGroup group, OpenXmlElement parentElement, MainDocumentPart targetDocument, Stack<FormattingState> fmtStack, ParagraphProperties pPr, Paragraph? currentParagraph, Run? currentRun)
123149
{
124150
// push a clone for this group's local modifications
125151
fmtStack.Push(TryPeek(fmtStack).Clone());
@@ -133,7 +159,8 @@ private void ConvertGroupInner(RtfGroup group, OpenXmlElement parentElement, Mai
133159
{
134160
if (destination.IsIgnorable)
135161
{
136-
// This subgroup is an ignorable destination (starts with *), skip it for now
162+
// This subgroup is an ignorable destination (starts with *), skip it for now.
163+
// In the future, we should support at least listtable and listoverridetable.
137164
continue;
138165
}
139166
else
@@ -150,12 +177,36 @@ private void ConvertGroupInner(RtfGroup group, OpenXmlElement parentElement, Mai
150177
ParseColorTable(destination);
151178
continue;
152179
}
180+
else if (dname == "header")
181+
{
182+
}
183+
else if (dname == "headerf")
184+
{
185+
}
186+
else if (dname == "headerl")
187+
{
188+
}
189+
else if (dname == "headerr")
190+
{
191+
}
192+
else if (dname == "footer")
193+
{
194+
}
195+
else if (dname == "footerf")
196+
{
197+
}
198+
else if (dname == "footerl")
199+
{
200+
}
201+
else if (dname == "footerr")
202+
{
203+
}
153204
else if (dname == "upr")
154205
{
155206
// Process the Unicode group only, ignore the ANSI equivalent
156207
var udGroup = group.Tokens.OfType<RtfDestination>().FirstOrDefault(d => d.Name == "ud");
157208
if (udGroup != null)
158-
ConvertGroupInner(udGroup, parentElement, targetDocument, fmtStack, pPr, ref currentParagraph, ref currentRun);
209+
ConvertGroupInner(udGroup, parentElement, targetDocument, fmtStack, pPr, currentParagraph, currentRun);
159210

160211
continue;
161212
}
@@ -172,7 +223,7 @@ private void ConvertGroupInner(RtfGroup group, OpenXmlElement parentElement, Mai
172223
}
173224

174225
// Recurse
175-
ConvertGroupInner(subGroup, parentElement, targetDocument, fmtStack, pPr, ref currentParagraph, ref currentRun);
226+
ConvertGroupInner(subGroup, parentElement, targetDocument, fmtStack, pPr, currentParagraph, currentRun);
176227
break;
177228
case RtfControlWord cw:
178229
HandleControlWord(cw, ref currentParagraph, ref currentRun, parentElement, TryPeek(fmtStack), pPr);
@@ -181,18 +232,18 @@ private void ConvertGroupInner(RtfGroup group, OpenXmlElement parentElement, Mai
181232
// Ensure paragraph and run exist
182233
var encoding = codePageEncoding ?? Encoding.GetEncoding(CultureInfo.CurrentCulture.TextInfo.ANSICodePage);
183234
string s = encoding.GetString([ch.CharCode]);
184-
HandleText(s, ref currentParagraph, ref currentRun, parentElement, TryPeek(fmtStack), pPr);
235+
HandleText(s, currentParagraph, currentRun, parentElement, TryPeek(fmtStack), pPr);
185236
break;
186237
case RtfText text:
187-
HandleText(text.Text, ref currentParagraph, ref currentRun, parentElement, TryPeek(fmtStack), pPr);
238+
HandleText(text.Text, currentParagraph, currentRun, parentElement, TryPeek(fmtStack), pPr);
188239
break;
189240
}
190241
}
191242
// restore parent formatting state
192243
TryPop(fmtStack);
193244
}
194245

195-
private void HandleText(string text, ref Paragraph? currentParagraph, ref Run? currentRun, OpenXmlElement parentElement, FormattingState runState, ParagraphProperties pPr)
246+
private void HandleText(string text, Paragraph? currentParagraph, Run? currentRun, OpenXmlElement parentElement, FormattingState runState, ParagraphProperties pPr)
196247
{
197248
text ??= string.Empty;
198249

@@ -232,17 +283,23 @@ private void HandleControlWord(RtfControlWord cw, ref Paragraph? currentParagrap
232283
switch (name)
233284
{
234285
case "sect":
235-
// end current section
236-
// TODO
286+
// End current section
287+
if (parentElement is Body body)
288+
{
289+
EnsureParagraph(ref currentParagraph, ref currentRun, parentElement, pPr);
290+
currentParagraph!.ParagraphProperties ??= new ParagraphProperties();
291+
currentSectPr ??= new SectionProperties();
292+
currentParagraph.ParagraphProperties.SectionProperties = (SectionProperties)currentSectPr.CloneNode(true);
293+
}
237294
break;
238295
case "par":
239-
// end current paragraph
296+
// End current paragraph
240297
currentParagraph = null;
241298
currentRun = null;
242299
break;
243300

244301
case "sectd": // reset section formatting
245-
// sectionState.Clear();
302+
ResetSectionProperties();
246303
break;
247304
case "pard": // reset paragraph formatting
248305
pPr.RemoveAllChildren();
@@ -253,6 +310,7 @@ private void HandleControlWord(RtfControlWord cw, ref Paragraph? currentParagrap
253310
runState.Clear();
254311
break;
255312

313+
// RTF header
256314
case "ansi":
257315
// If ANSI is specified, use the system ANSI code page,
258316
// unless the DefaultCodePage value is set to a different value.
@@ -293,6 +351,123 @@ private void HandleControlWord(RtfControlWord cw, ref Paragraph? currentParagrap
293351
}
294352
break;
295353

354+
// Document settings
355+
case "paperw":
356+
if (cw.HasValue)
357+
{
358+
defaultSectPr ??= new SectionProperties();
359+
var pageSize = defaultSectPr.GetFirstChild<PageSize>() ?? defaultSectPr.AppendChild(new PageSize());
360+
pageSize.Width = (uint)cw.Value!.Value;
361+
}
362+
break;
363+
case "paperh":
364+
if (cw.HasValue)
365+
{
366+
defaultSectPr ??= new SectionProperties();
367+
var pageSize = defaultSectPr.GetFirstChild<PageSize>() ?? defaultSectPr.AppendChild(new PageSize());
368+
pageSize.Height = (uint)cw.Value!.Value;
369+
}
370+
break;
371+
372+
// Section properties
373+
case "lndscpsxn":
374+
if (cw.HasValue)
375+
{
376+
currentSectPr ??= new SectionProperties();
377+
var pageSize = currentSectPr.GetFirstChild<PageSize>() ?? currentSectPr.AppendChild(new PageSize());
378+
pageSize.Orient = PageOrientationValues.Landscape;
379+
}
380+
break;
381+
case "margbsxn":
382+
if (cw.HasValue)
383+
{
384+
currentSectPr ??= new SectionProperties();
385+
var pageMargin = currentSectPr.GetFirstChild<PageMargin>() ?? currentSectPr.AppendChild(new PageMargin());
386+
pageMargin.Bottom = cw.Value!.Value;
387+
}
388+
break;
389+
case "marglsxn":
390+
if (cw.HasValue)
391+
{
392+
currentSectPr ??= new SectionProperties();
393+
var pageMargin = currentSectPr.GetFirstChild<PageMargin>() ?? currentSectPr.AppendChild(new PageMargin());
394+
pageMargin.Left = (uint)cw.Value!.Value;
395+
}
396+
break;
397+
case "margrsxn":
398+
if (cw.HasValue)
399+
{
400+
currentSectPr ??= new SectionProperties();
401+
var pageMargin = currentSectPr.GetFirstChild<PageMargin>() ?? currentSectPr.AppendChild(new PageMargin());
402+
pageMargin.Right = (uint)cw.Value!.Value;
403+
}
404+
break;
405+
case "margtsxn":
406+
if (cw.HasValue)
407+
{
408+
currentSectPr ??= new SectionProperties();
409+
var pageMargin = currentSectPr.GetFirstChild<PageMargin>() ?? currentSectPr.AppendChild(new PageMargin());
410+
pageMargin.Top = cw.Value!.Value;
411+
}
412+
break;
413+
case "pgwsxn":
414+
if (cw.HasValue)
415+
{
416+
currentSectPr ??= new SectionProperties();
417+
var pageSize = currentSectPr.GetFirstChild<PageSize>() ?? currentSectPr.AppendChild(new PageSize());
418+
pageSize.Width = (uint)cw.Value!.Value;
419+
}
420+
break;
421+
case "pghsxn":
422+
if (cw.HasValue)
423+
{
424+
currentSectPr ??= new SectionProperties();
425+
var pageSize = currentSectPr.GetFirstChild<PageSize>() ?? currentSectPr.AppendChild(new PageSize());
426+
pageSize.Height = (uint)cw.Value!.Value;
427+
}
428+
break;
429+
case "sbknone":
430+
if (cw.HasValue)
431+
{
432+
currentSectPr ??= new SectionProperties();
433+
var sectionType = currentSectPr.GetFirstChild<SectionType>() ?? currentSectPr.AppendChild(new SectionType());
434+
sectionType.Val = SectionMarkValues.Continuous;
435+
}
436+
break;
437+
case "sbkcol":
438+
if (cw.HasValue)
439+
{
440+
currentSectPr ??= new SectionProperties();
441+
var sectionType = currentSectPr.GetFirstChild<SectionType>() ?? currentSectPr.AppendChild(new SectionType());
442+
sectionType.Val = SectionMarkValues.NextColumn;
443+
}
444+
break;
445+
case "sbkodd":
446+
if (cw.HasValue)
447+
{
448+
currentSectPr ??= new SectionProperties();
449+
var sectionType = currentSectPr.GetFirstChild<SectionType>() ?? currentSectPr.AppendChild(new SectionType());
450+
sectionType.Val = SectionMarkValues.OddPage;
451+
}
452+
break;
453+
case "sbkeven":
454+
if (cw.HasValue)
455+
{
456+
currentSectPr ??= new SectionProperties();
457+
var sectionType = currentSectPr.GetFirstChild<SectionType>() ?? currentSectPr.AppendChild(new SectionType());
458+
sectionType.Val = SectionMarkValues.EvenPage;
459+
}
460+
break;
461+
case "sbkpage":
462+
if (cw.HasValue)
463+
{
464+
currentSectPr ??= new SectionProperties();
465+
var sectionType = currentSectPr.GetFirstChild<SectionType>() ?? currentSectPr.AppendChild(new SectionType());
466+
sectionType.Val = SectionMarkValues.NextPage;
467+
}
468+
break;
469+
470+
// Breaks
296471
case "line":
297472
// text-wrapping line break. Avoid emitting duplicate breaks when previous token
298473
// already produced a text-wrapping break (some RTF producers emit both \line and \lbr).
@@ -341,6 +516,7 @@ private void HandleControlWord(RtfControlWord cw, ref Paragraph? currentParagrap
341516
}
342517
break;
343518

519+
// Special characters
344520
// TODO: use the current culture specified in RTF for the fallback string of chdate and chtime
345521
case "chdate":
346522
CreateField("date", DateTime.Now.ToShortDateString(), ref currentParagraph, ref currentRun, parentElement, runState, pPr);
@@ -360,7 +536,6 @@ private void HandleControlWord(RtfControlWord cw, ref Paragraph? currentParagrap
360536
case "sectnum": // TODO: keep track of the current section number and write it as fallback
361537
CreateSimpleField(" SECTION \\* MERGEFORMAT ", "1", ref currentParagraph, ref currentRun, parentElement, runState, pPr);
362538
break;
363-
364539
// TODO: create comments and footnotes/endnotes (followed by the content group)
365540
// case "chatn":
366541
// break;
@@ -412,7 +587,7 @@ private void HandleControlWord(RtfControlWord cw, ref Paragraph? currentParagrap
412587
charCode += 65536;
413588
}
414589
string s = char.ConvertFromUtf32(charCode);
415-
HandleText(s, ref currentParagraph, ref currentRun, parentElement, runState, pPr);
590+
HandleText(s, currentParagraph, currentRun, parentElement, runState, pPr);
416591
// After emitting the Unicode character, the RTF specification says that
417592
// the following "uc" ANSI characters should be ignored. Track how many
418593
// to skip on the formatting state so subsequent text tokens can consume them.
@@ -711,6 +886,7 @@ private void HandleControlWord(RtfControlWord cw, ref Paragraph? currentParagrap
711886
pPr.TextAlignment = new TextAlignment() { Val = VerticalTextAlignmentValues.Baseline };
712887
break;
713888
case "favar":
889+
case "fafixed":
714890
pPr.TextAlignment = new TextAlignment() { Val = VerticalTextAlignmentValues.Bottom };
715891
break;
716892
case "facenter":
@@ -968,18 +1144,37 @@ private void HandleControlWord(RtfControlWord cw, ref Paragraph? currentParagrap
9681144
}
9691145
}
9701146

971-
private void EnsureRun(ref Paragraph? currentParagraph, ref Run? currentRun, OpenXmlElement parentElement, FormattingState runState, ParagraphProperties pPr)
1147+
/// <summary>
/// Resets the section properties that are currently being collected.
/// When document-level defaults exist (<c>defaultSectPr</c>), the current section
/// properties become a deep copy of them; otherwise the current instance is
/// created if missing and stripped of all child elements and attributes.
/// </summary>
private void ResetSectionProperties()
{
    if (defaultSectPr is null)
    {
        // No document-level defaults: reuse (or lazily create) the current instance and empty it.
        currentSectPr ??= new SectionProperties();
        currentSectPr.RemoveAllChildren();
        currentSectPr.ClearAllAttributes();
        return;
    }

    // Deep-clone so that later edits to the current section never touch the defaults.
    currentSectPr = (SectionProperties)defaultSectPr.CloneNode(true);
}
1160+
1161+
/// <summary>
/// Guarantees that <paramref name="currentParagraph"/> refers to a paragraph.
/// If none is open, a new one is built from <paramref name="pPr"/> via
/// <c>CreateParagraphWithProperties</c>, appended to <paramref name="parentElement"/>,
/// and <paramref name="currentRun"/> is reset to <c>null</c>.
/// </summary>
/// <param name="currentParagraph">The open paragraph, or <c>null</c>; set to the new paragraph when one is created.</param>
/// <param name="currentRun">The open run; cleared when a new paragraph is created.</param>
/// <param name="parentElement">Element that receives a newly created paragraph.</param>
/// <param name="pPr">Paragraph properties passed to <c>CreateParagraphWithProperties</c>.</param>
private void EnsureParagraph(ref Paragraph? currentParagraph, ref Run? currentRun, OpenXmlElement parentElement, ParagraphProperties pPr)
{
    if (currentParagraph != null)
    {
        // A paragraph is already open; nothing to do.
        return;
    }

    var paragraph = CreateParagraphWithProperties(pPr);
    currentParagraph = paragraph;
    parentElement.Append(paragraph);

    // The run is cleared whenever a new paragraph is opened.
    currentRun = null;
}
1170+
1171+
/// <summary>
/// Guarantees that both a paragraph and a run are open.
/// First delegates to <c>EnsureParagraph</c>; then, if <paramref name="currentRun"/> is
/// <c>null</c>, creates a run from <paramref name="runState"/> via
/// <c>CreateRunWithProperties</c> and appends it to the current paragraph.
/// </summary>
/// <param name="currentParagraph">The open paragraph, or <c>null</c>; non-null on return.</param>
/// <param name="currentRun">The open run, or <c>null</c>; non-null on return.</param>
/// <param name="parentElement">Element that receives a newly created paragraph.</param>
/// <param name="runState">Formatting state passed to <c>CreateRunWithProperties</c>.</param>
/// <param name="pPr">Paragraph properties used if a paragraph has to be created.</param>
private void EnsureRun(ref Paragraph? currentParagraph, ref Run? currentRun, OpenXmlElement parentElement, FormattingState runState, ParagraphProperties pPr)
{
    EnsureParagraph(ref currentParagraph, ref currentRun, parentElement, pPr);
    if (currentRun == null)
    {
        currentRun = CreateRunWithProperties(runState);
        // Append exactly once: an element that already has a parent cannot be appended again.
        // EnsureParagraph assigns currentParagraph when it is null, hence the null-forgiving operator.
        currentParagraph!.Append(currentRun);
    }
}
9851180

0 commit comments

Comments
 (0)