feat: Add support for "word" files exported from Confluence (#1600)

* Display error message to end user

* fix: Improve conversion of tables

* fix: Characters at ends of lines in tables lost
This commit is contained in:
Tom Moor
2020-10-21 08:53:59 -07:00
committed by GitHub
parent b78e2f1e05
commit df7b9f3e88
9 changed files with 394 additions and 3 deletions

View File

@ -1,11 +1,14 @@
// @flow
import fs from "fs";
import File from "formidable/lib/file";
import { strikethrough, tables } from "joplin-turndown-plugin-gfm";
import mammoth from "mammoth";
import quotedPrintable from "quoted-printable";
import TurndownService from "turndown";
import utf8 from "utf8";
import uuid from "uuid";
import parseTitle from "../../shared/utils/parseTitle";
import { InvalidRequestError } from "../errors";
import { FileImportError, InvalidRequestError } from "../errors";
import { Attachment, Event, User } from "../models";
import dataURItoBuffer from "../utils/dataURItoBuffer";
import parseImages from "../utils/parseImages";
@ -18,12 +21,28 @@ const turndownService = new TurndownService({
headingStyle: "atx",
});
// Register the GitHub-flavored markdown plugins so that strikethroughs and
// tables are handled during conversion
turndownService.use(strikethrough);
turndownService.use(tables);

// Emit a plain newline for every <br> element
turndownService.addRule("breaks", {
  filter: ["br"],
  replacement: () => "\n",
});
// Shape of one entry in the import mapping: a file type paired with the
// function that turns a file of that type into a string.
// NOTE(review): presumably the importer looks entries up by the uploaded
// file's type — confirm against the caller.
interface ImportableFile {
// File type identifier; the visible entries use MIME type strings such as
// "application/msword"
type: string;
// Asynchronously converts the given file; the resolved string is expected to
// be markdown (going by the name and the converters assigned to it)
getMarkdown: (file: any) => Promise<string>;
}
const importMapping: ImportableFile[] = [
{
type: "application/msword",
getMarkdown: confluenceToMarkdown,
},
{
type:
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@ -57,6 +76,60 @@ async function htmlToMarkdown(file): Promise<string> {
return turndownService.turndown(value);
}
async function confluenceToMarkdown(file): Promise<string> {
let value = await fs.promises.readFile(file.path, "utf8");
// We're only supporting the ridiculous output from Confluence here, regular
// Word documents should call into the docxToMarkdown importer.
// See: https://jira.atlassian.com/browse/CONFSERVER-38237
if (!value.includes("Content-Type: multipart/related")) {
throw new FileImportError("Unsupported Word file");
}
// get boundary marker
const boundaryMarker = value.match(/boundary="(.+)"/);
if (!boundaryMarker) {
throw new FileImportError("Unsupported Word file (No boundary marker)");
}
// get content between multipart boundaries
let boundaryReached = 0;
const lines = value.split("\n").filter((line) => {
if (line.includes(boundaryMarker[1])) {
boundaryReached++;
return false;
}
if (line.startsWith("Content-")) {
return false;
}
// 1 == definition
// 2 == content
// 3 == ending
if (boundaryReached === 2) {
return true;
}
return false;
});
if (!lines.length) {
throw new FileImportError("Unsupported Word file (No content found)");
}
// Mime attachment is "quoted printable" encoded, must be decoded first
// https://en.wikipedia.org/wiki/Quoted-printable
value = utf8.decode(quotedPrintable.decode(lines.join("\n")));
// If we don't remove the title here it becomes printed in the document
// body by turndown
turndownService.remove(["style", "xml", "title"]);
// Now we should have something that looks like HTML
const html = turndownService.turndown(value);
return html.replace(/<br>/g, " \\n ");
}
export default async function documentImporter({
file,
user,