using Spire.Doc;
using Spire.Doc.Documents;
using Spire.Doc.Fields;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;

namespace Edu.Common.Plugin
{
    /// <summary>
    /// Helpers for turning a Word question document into structured question data.
    /// </summary>
    public class WordHelper
    {
        /// <summary>
        /// Parses a Word document with Spire.Doc and splits its text into question categories.
        /// </summary>
        /// <remarks>
        /// The document is also exported to a temporary HTML file so that underlined spans can be
        /// located, and every embedded picture is written to the same temporary folder as a PNG.
        /// The temporary files are not removed by this method.
        /// </remarks>
        /// <param name="filePath">Path of the Word document to load.</param>
        /// <param name="htmlList">
        /// Receives one entry per underlined span found in the exported HTML. Extraction is
        /// best-effort: if it fails, the list contains whatever was collected before the failure.
        /// </param>
        /// <returns>
        /// The parsed categories. When the text has no "第X部分" heading the result holds a single
        /// category produced from the whole text.
        /// </returns>
        public static List<QuestionWordData> GetWordData(string filePath, out List<QuestionTitleData> htmlList)
        {
            List<QuestionWordData> list = new List<QuestionWordData>();

            Spire.Doc.Document document = new Spire.Doc.Document();
            document.LoadFromFile(filePath);

            // <base>/upfile/temporary/<yyyyMMdd>/html
            string tempPath = Path.Combine(
                AppContext.BaseDirectory,
                "upfile",
                "temporary",
                DateTime.Now.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture),
                "html");
            // CreateDirectory is a no-op when the directory already exists.
            Directory.CreateDirectory(tempPath);

            string htmlName = Path.Combine(tempPath, DateTime.Now.Ticks + ".html");
            document.SaveToFile(htmlName, FileFormat.Html);

            htmlList = new List<QuestionTitleData>();
            try
            {
                CollectUnderlineTitles(htmlName, htmlList);
            }
            catch (Exception ex)
            {
                // Underline extraction is optional; keep parsing the document text, but leave a
                // trace instead of hiding the failure completely.
                System.Diagnostics.Trace.TraceWarning("WordHelper.GetWordData: underline extraction failed: {0}", ex);
            }

            List<string> imageList = ExportImages(document, tempPath);

            string str = document.GetText();
            str = str.Replace("Evaluation Warning: The document was created with Spire.Doc for .NET.", "");

            // Top-level categories are introduced by headings of the form "第X部分".
            string categoryRule1 = @"[第][一二三四五六七八九十][部][分]";
            if (Regex.IsMatch(str, categoryRule1))
            {
                foreach (string part in Regex.Split(str, categoryRule1))
                {
                    if (string.IsNullOrEmpty(part))
                    {
                        continue;
                    }

                    QuestionWordData model = AnalysisQuestionCategory(part, imageList, isBigType: true);
                    if (model != null && !string.IsNullOrEmpty(model.BigTitle))
                    {
                        list.Add(model);
                    }
                }
            }
            else
            {
                list.Add(AnalysisQuestionCategory(str, imageList));
            }

            return list;
        }

        /// <summary>
        /// Reads the exported HTML and appends one entry per underlined span to <paramref name="titles"/>.
        /// </summary>
        /// <param name="htmlPath">Path of the HTML file exported from the Word document.</param>
        /// <param name="titles">List that receives the entries; entries are appended as they are found.</param>
        private static void CollectUnderlineTitles(string htmlPath, List<QuestionTitleData> titles)
        {
            // ReadAllText opens and closes the file itself, so no handle is left open.
            string html = File.ReadAllText(htmlPath, Encoding.UTF8);

            foreach (Match paragraphMatch in Regex.Matches(html, @"<p.*?>(.*?)</p>"))
            {
                string paragraphHtml = paragraphMatch.Value;
                if (!paragraphHtml.Contains("underline"))
                {
                    continue;
                }

                MatchCollection spans = Regex.Matches(paragraphHtml, @"<span[^>]+>([^<]+)</span>");
                for (int i = 0; i < spans.Count; i++)
                {
                    string spanHtml = spans[i].Value;
                    if (!spanHtml.Contains("underline"))
                    {
                        continue;
                    }

                    string key = Common.Plugin.HtmlHelper.StripHT(spanHtml);

                    // A fresh list per underlined span: sharing one list across a paragraph made
                    // every entry of that paragraph alias (and accumulate) the others' context.
                    List<string> context = new List<string>();
                    if (i > 0)
                    {
                        AddContext(context, spans[i - 1].Value);
                    }
                    if (i + 1 < spans.Count)
                    {
                        AddContext(context, spans[i + 1].Value);
                    }

                    if (!string.IsNullOrEmpty(key) && context.Count > 0)
                    {
                        titles.Add(new QuestionTitleData()
                        {
                            UnderLine = key,
                            BeforeAfter = context
                        });
                    }
                }
            }
        }

        /// <summary>
        /// Strips the markup from a neighbouring span and keeps its text when it is longer than one character.
        /// </summary>
        private static void AddContext(List<string> context, string spanHtml)
        {
            string text = Common.Plugin.HtmlHelper.StripHT(spanHtml);
            if (!string.IsNullOrEmpty(text) && text.Length > 1)
            {
                context.Add(text.TrimStart('、'));
            }
        }

        /// <summary>
        /// Saves every picture of the document as a PNG under <paramref name="tempPath"/>.
        /// </summary>
        /// <returns>The saved file paths, in the order the pictures were encountered.</returns>
        private static List<string> ExportImages(Spire.Doc.Document document, string tempPath)
        {
            List<string> imageList = new List<string>();
            foreach (Section section in document.Sections)
            {
                foreach (Paragraph paragraph in section.Paragraphs)
                {
                    foreach (DocumentObject docObject in paragraph.ChildObjects)
                    {
                        if (docObject.DocumentObjectType == DocumentObjectType.Picture && docObject is DocPicture pic)
                        {
                            // Ticks alone can repeat inside a tight loop; the running index keeps
                            // each file name unique so pictures never overwrite one another.
                            string imageName = $"WordImage_{DateTime.Now.Ticks}_{imageList.Count}.png";
                            string tempImagePath = Path.Combine(tempPath, imageName);

                            using (MemoryStream ms = new MemoryStream(pic.ImageBytes))
                            using (Image outputImg = Image.FromStream(ms))
                            {
                                outputImg.Save(tempImagePath, System.Drawing.Imaging.ImageFormat.Png);
                            }

                            imageList.Add(tempImagePath);
                        }
                    }
                }
            }

            return imageList;
        }

        /// <summary>
        /// Parses one category's text into a title and its list of questions.
        /// </summary>
        /// <param name="questionStr">Text of the category (or of the whole document).</param>
        /// <param name="imageList">Paths of the exported pictures, in document order; may be null.</param>
        /// <param name="isBigType">
        /// True when the text is one part of a document split by "第X部分" headings. The first split
        /// segment then becomes the title; otherwise the title is the fixed value "默认".
        /// </param>
        /// <returns>
        /// The parsed category. For <paramref name="isBigType"/> text that matches neither question
        /// pattern, the title and question list are left unset.
        /// </returns>
        private static QuestionWordData AnalysisQuestionCategory(string questionStr, List<string> imageList, bool isBigType = false)
        {
            QuestionWordData questionWordData = new QuestionWordData();

            questionStr = ReplaceImagePlaceholders(questionStr, imageList);

            // Question separator: CR LF, zero or more digits, then "、".
            string pattern1 = @"\r\n[12345678901234567890]*[、]";

            if (isBigType)
            {
                // Sub-heading separator: a line break, optional spaces, then a parenthesised
                // Chinese numeral. It takes precedence over the numbered-question separator.
                string pattern2 = @"[\r\n][ ]*[((][一二三四五六七八九十]*[))]";

                string[] tempArray = null;
                if (Regex.IsMatch(questionStr, pattern2))
                {
                    tempArray = Regex.Split(questionStr, pattern2);
                }
                else if (Regex.IsMatch(questionStr, pattern1))
                {
                    tempArray = Regex.Split(questionStr, pattern1);
                }

                if (tempArray != null && tempArray.Length > 0)
                {
                    // The text before the first separator is the category title.
                    questionWordData.BigTitle = tempArray[0].Trim().Replace("、", "").Replace(".", "");

                    string[] questions = new string[tempArray.Length - 1];
                    Array.Copy(tempArray, 1, questions, 0, questions.Length);
                    questionWordData.QuestionList = AnalysisQuestionList(questions);
                }
            }
            else
            {
                string[] tempArray = Regex.IsMatch(questionStr, pattern1)
                    ? Regex.Split(questionStr, pattern1)
                    : null;

                questionWordData.BigTitle = "默认";
                questionWordData.QuestionList = AnalysisQuestionList(tempArray);
            }

            return questionWordData;
        }

        /// <summary>
        /// Replaces picture placeholders of the form "【图N】" (N written in Chinese numerals) with
        /// the image string built from the N-th exported picture.
        /// </summary>
        /// <remarks>
        /// A placeholder is left untouched when its number does not convert to a positive value,
        /// when no picture exists at that position, or when the picture cannot be converted to Base64.
        /// </remarks>
        private static string ReplaceImagePlaceholders(string questionStr, List<string> imageList)
        {
            string imgPattern = @"[【][图][一二三四五六七八九十]*[】]";

            foreach (Match placeholder in Regex.Matches(questionStr, imgPattern))
            {
                string token = placeholder.Value;

                // Keep only the numeral between the brackets, e.g. "【图三】" -> "三".
                var numStr = token.Replace("【", "").Replace("】", "").Replace("图", "").Trim();
                var newNumber = ConvertHelper.ConvertStringToNumber(numStr);
                if (newNumber > 0)
                {
                    // Placeholders are 1-based; the exported list is 0-based.
                    string filePath = imageList != null && imageList.Count >= newNumber
                        ? imageList[newNumber - 1]
                        : "";
                    if (!string.IsNullOrEmpty(filePath))
                    {
                        string imageBase64 = ConvertHelper.ConvertImageToBase64(filePath);
                        if (!string.IsNullOrEmpty(imageBase64))
                        {
                            questionStr = questionStr.Replace(token, ConvertHelper.GetImageString(imageBase64));
                        }
                    }
                }
            }

            return questionStr;
        }

        /// <summary>
        /// Builds the question list from split segments, dropping null and empty segments.
        /// </summary>
        /// <param name="tempArray">Segments produced by splitting the category text; may be null.</param>
        /// <returns>The non-empty segments in their original order; empty when the input is null.</returns>
        private static List<string> AnalysisQuestionList(string[] tempArray)
        {
            List<string> list = new List<string>();
            if (tempArray == null)
            {
                return list;
            }

            foreach (string item in tempArray)
            {
                if (!string.IsNullOrEmpty(item))
                {
                    list.Add(item);
                }
            }

            return list;
        }
    }

    /// <summary>
    /// One question category parsed from a Word document.
    /// </summary>
    public class QuestionWordData
    {
        /// <summary>
        /// Category title.
        /// </summary>
        public string BigTitle { get; set; }

        /// <summary>
        /// Questions belonging to the category.
        /// </summary>
        public List<string> QuestionList { get; set; }
    }

    /// <summary>
    /// An underlined piece of text together with the text surrounding it.
    /// </summary>
    public class QuestionTitleData
    {
        /// <summary>
        /// The underlined text.
        /// </summary>
        public string UnderLine { get; set; }

        /// <summary>
        /// Text immediately before and/or after the underlined text.
        /// </summary>
        public List<string> BeforeAfter { get; set; }
    }
}