c#抽取pdf文档标题（4）——机器学习以及决策树

我的一位同事告诉我，pdf抽取标题，用机器学习可以完美解决问题，抽取的准确率比较高。于是，我看了一些资料，就动起手来，实践了下。

我的做法是：根据以往历史块的特征生成一棵决策树，然后利用这棵决策树，去判断一个新的块到底是不是标题。理论上，历史块的数量越庞大，结果越准确；但实践下来并非如此——我的体会是，影响结果判断的因素（特征）越少，并且历史库达到一定数量之后，判断才越准确。这个记录块信息的历史库，就是供计算机学习的原料（训练数据）。

首先看下，如何形成一个决策树？

  /// <summary>
  /// Builds a decision tree from stored block records.
  /// Reads <c>BlockData</c> records via <c>DBHelper.Select</c>, converts each record's
  /// Index, Space, XSize, YSize and Height into a string bucket ("high"/"middle"/"low",
  /// or just "high"/"low"), stores <c>IsTitle.ToString()</c> as the class value, then
  /// constructs a <c>DecisionTreeID3&lt;string&gt;</c> over the resulting table and calls
  /// <c>Learn()</c> on it.
  /// </summary>
  /// <returns>The tree after <c>Learn()</c> has been called.</returns>
  // NOTE(review): every integer literal in this listing is missing (array width, loop
  // start, column indices in da[i, ], and all bucket boundaries). The method does not
  // compile as shown; the values must be restored from the original project.
  // Presumably the column indices follow the order of `names` below (0..5) and the
  // array width is 6 - TODO confirm.
  private static DecisionTreeID3<string> BuildTree()

         {

             //var blockList = Tools.SelectList("/config/Blocks/Block");

             var blockList = DBHelper.Select<BlockData>();

             string[,] da = new string[blockList.Count, ];

             for (int i = ; i < blockList.Count; i++)

             {
                 // Index feature: three buckets (high / middle / low).
                 var index = blockList[i].Index;

                 if (index >=  && index <= )

                 {

                     da[i, ] = "high";

                 }

                 else if (index >=  && index <= )

                 {

                     da[i, ] = "middle";

                 }

                 else

                 {

                     da[i, ] = "low";

                 }
                 // Space feature: when Space.ToString() equals "非数字" a fallback constant
                 // (missing here) is used, otherwise Space is truncated to int.
                 // NOTE(review): "非数字" is presumably the zh-CN rendering of NaN, which
                 // would make this comparison culture-dependent - verify.
                 var space = blockList[i].Space.ToString() == "非数字" ?  : (int)blockList[i].Space;

                 if (space >=  && space <=  || space >=  && space <= )

                 {

                     da[i, ] = "high";

                 }

                 else if (space >=  && space <= )

                 {

                     da[i, ] = "middle";

                 }

                 else

                 {

                     da[i, ] = "low";

                 }
                 // XSize feature: two buckets only; "high" when inside any of three ranges.
                 var xSize = blockList[i].XSize;

                 if (xSize >=  && xSize <=  || xSize >=  && xSize <=  || xSize >=  && xSize <= )

                 {

                     da[i, ] = "high";

                 }

                 else

                 {

                     da[i, ] = "low";

                 }
                 // YSize feature: same two-bucket scheme as XSize.
                 var ySize = blockList[i].YSize;

                 if (ySize >=  && ySize <=  || ySize >=  && ySize <=  || ySize >=  && ySize <= )

                 {

                     da[i, ] = "high";

                 }

                 else

                 {

                     da[i, ] = "low";

                 }
                 // Height feature: truncated to int, then "high" inside either of two ranges.
                 var height = (int)blockList[i].Height;

                 if (height >=  && height <=  || height >=  && height <= )

                 {

                     da[i, ] = "high";

                 }

                 else

                 {

                     da[i, ] = "low";

                 }
                 // Class value: IsTitle rendered as text, matched by the "True"/"False" labels below.
                 da[i, ] = blockList[i].IsTitle.ToString();

             }
             // Column names; DecisionTreeID3 treats the last column as the class variable.
             var names = new string[] { "Index", "Space", "XSize", "YSize", "Height", "IsTitle" };

             var tree = new DecisionTreeID3<string>(da, names, new string[] { "True", "False" });

             tree.Learn();

             return tree;

         }

把数据库中的块信息通过转换变成二维数组，并且把每个特征值转为离散的值。原始的特征值几乎是连续的，可能的取值有多少个无法确定；只有转为离散的值，才能控制决策树的规模。下面，我们看看决策树类 DecisionTreeID3：

  /// <summary>
  /// Decision tree over a table of discrete attribute values.
  /// Each row of the table is one training sample; the LAST column is the class
  /// variable. Despite the name, the tree is grown with the C4.5 criterion
  /// (information gain ratio, see <see cref="MaxEntropyRate"/>); the plain ID3
  /// criterion is available as <see cref="MaxEntropy"/>.
  /// </summary>
  /// <typeparam name="T">Type of the cell values (attribute values and class labels).</typeparam>
  public class DecisionTreeID3<T> where T : IEquatable<T>
  {
      // Splitting stops once the recursion depth exceeds this value, which bounds
      // the size of the tree. A branch cut off this way gets no class leaf.
      private const int MaxDepth = 10;

      T[,] Data;

      string[] Names;

      // Index of the class column (always the last column of Data).
      int Category;

      T[] CategoryLabels;

      /// <summary>Root of the learned tree; null until <see cref="Learn()"/> has run.</summary>
      public DecisionTreeNode<T> Root;

      /// <summary>Textual dump of the learned tree, filled by <see cref="DisplayNode"/>.</summary>
      public StringBuilder sb = new StringBuilder();

      /// <summary>Creates an untrained tree over the given sample table.</summary>
      /// <param name="data">Samples (rows) by columns; the class variable must be the last column.</param>
      /// <param name="names">Column names, indexed like the columns of <paramref name="data"/>.</param>
      /// <param name="categoryLabels">All values the class column can take.</param>
      /// <exception cref="ArgumentNullException">Any argument is null.</exception>
      public DecisionTreeID3(T[,] data, string[] names, T[] categoryLabels)
      {
          if (data == null)
          {
              throw new ArgumentNullException("data");
          }
          if (names == null)
          {
              throw new ArgumentNullException("names");
          }
          if (categoryLabels == null)
          {
              throw new ArgumentNullException("categoryLabels");
          }

          Data = data;
          Names = names;
          Category = data.GetLength(1) - 1; // the class variable lives in the last column
          CategoryLabels = categoryLabels;
      }

      /// <summary>
      /// Grows the tree from all rows and columns of the table, stores it in
      /// <see cref="Root"/> and writes a textual dump of it into <see cref="sb"/>.
      /// </summary>
      public void Learn()
      {
          int nRows = Data.GetLength(0);
          int nCols = Data.GetLength(1);
          int[] rows = Enumerable.Range(0, nRows).ToArray();
          int[] cols = Enumerable.Range(0, nCols).ToArray();

          // Label -1 marks the artificial root, which tests no attribute.
          Root = new DecisionTreeNode<T>(-1, default(T));
          Learn(rows, cols, Root);

          // Start from an empty dump so repeated training does not append duplicates.
          sb.Clear();
          DisplayNode(Root);
      }

      /// <summary>
      /// Walks the tree along the attribute values in <paramref name="test"/> and reports
      /// whether the walk ends in a class leaf whose value prints as "True".
      /// </summary>
      /// <param name="test">
      /// Attribute values indexed like the table columns. The entry at the class
      /// column index is never read, so it may be a placeholder or be omitted.
      /// </param>
      /// <param name="Node">Subtree to search; null means the whole tree.</param>
      /// <returns>
      /// true when a "True" leaf is reached; false when the leaf has another value,
      /// when no branch matches, or when the matching branch was cut off by the depth limit.
      /// </returns>
      /// <exception cref="ArgumentNullException"><paramref name="test"/> is null.</exception>
      /// <exception cref="ArgumentException"><paramref name="test"/> has fewer entries than there are attribute columns.</exception>
      /// <exception cref="InvalidOperationException">The tree has not been trained yet.</exception>
      public bool Search(string[] test, DecisionTreeNode<T> Node = null)
      {
          if (test == null)
          {
              throw new ArgumentNullException("test");
          }
          if (test.Length < Category)
          {
              throw new ArgumentException("test must contain one value per attribute column.", "test");
          }
          if (Node == null)
          {
              Node = Root;
          }
          if (Node == null)
          {
              throw new InvalidOperationException("Learn() must be called before Search().");
          }

          foreach (var item in Node.Children)
          {
              string value = item.Value.ToString();

              if (item.Label == Category)
              {
                  // Class leaf: only a "True" leaf is a positive answer.
                  if (value == "True")
                  {
                      return true;
                  }
                  continue;
              }

              // Attribute node: follow it only when the sample has this value.
              if (test[item.Label] != value)
              {
                  continue;
              }

              if (Search(test, item))
              {
                  return true;
              }
          }

          return false;
      }

      /// <summary>
      /// Appends one line per node of the subtree to <see cref="sb"/>, in the form
      /// "--- name: value", with the dash run growing with the depth.
      /// </summary>
      /// <param name="Node">Subtree to dump.</param>
      /// <param name="depth">Depth of <paramref name="Node"/>; only affects the indentation.</param>
      public void DisplayNode(DecisionTreeNode<T> Node, int depth = 0)
      {
          // The artificial root (label -1) has no attribute to print.
          if (Node.Label != -1)
          {
              string nodeStr = string.Format("{0} {1}: {2}", new string('-', depth * 3), Names[Node.Label], Node.Value);
              sb.AppendLine(nodeStr);
          }

          foreach (var item in Node.Children)
          {
              DisplayNode(item, depth + 1);
          }
      }

      // Recursively grows the subtree below `parent` from the given rows, choosing
      // among the given columns (which always still contain the class column).
      private void Learn(int[] pnRows, int[] pnCols, DecisionTreeNode<T> parent, int depth = 0)
      {
          // Materialise once: the values are inspected several times below.
          var categoryValues = GetAttribute(Data, Category, pnRows).ToList();
          int categoryCount = categoryValues.Distinct().Count();

          if (categoryCount == 1)
          {
              // Pure subset: every row has the same class, emit the leaf.
              parent.Children.Add(new DecisionTreeNode<T>(Category, categoryValues[0]));
              return;
          }

          if (depth > MaxDepth)
          {
              return; // depth limit reached: stop growing, no leaf is added
          }

          if (pnRows.Length == 0)
          {
              return;
          }

          if (pnCols.Length == 1)
          {
              // Only the class column is left: decide by majority vote.
              // Ties go to the class that occurs first among the rows.
              var vote = categoryValues.GroupBy(v => v).OrderByDescending(g => g.Count()).First();
              parent.Children.Add(new DecisionTreeNode<T>(Category, vote.Key));
              return;
          }

          // C4.5: split on the attribute with the highest gain ratio.
          // (MaxEntropy(pnRows, pnCols) would give plain ID3 instead.)
          int maxCol = MaxEntropyRate(pnRows, pnCols);
          int[] cols = pnCols.Where(c => c != maxCol).ToArray();
          var attributes = GetAttribute(Data, maxCol, pnRows).Distinct().ToList();

          foreach (var attr in attributes)
          {
              int[] rows = pnRows.Where(irow => Data[irow, maxCol].Equals(attr)).ToArray();
              var node = new DecisionTreeNode<T>(maxCol, attr);
              parent.Children.Add(node);
              Learn(rows, cols, node, depth + 1); // grow the subtree recursively
          }
      }

      /// <summary>
      /// Conditional entropy of the class variable given attribute <paramref name="attrCol"/>,
      /// i.e. the weighted sum of the class entropies of each attribute-value subset.
      /// </summary>
      /// <param name="attrCol">Column index of the attribute.</param>
      /// <param name="pnRows">Row indices to consider.</param>
      /// <exception cref="InvalidOperationException">A row holds a class value that is not in the category labels.</exception>
      public double AttributeInfo(int attrCol, int[] pnRows)
      {
          var tuples = AttributeCount(attrCol, pnRows);
          var sum = (double)pnRows.Length;
          double Entropy = 0.0;

          foreach (var tuple in tuples)
          {
              // Class histogram of the rows that carry this attribute value.
              int[] count = new int[CategoryLabels.Length];
              foreach (var irow in pnRows)
              {
                  if (Data[irow, attrCol].Equals(tuple.Item1))
                  {
                      int index = Array.IndexOf(CategoryLabels, Data[irow, Category]);
                      if (index < 0)
                      {
                          throw new InvalidOperationException("Class value '" + Data[irow, Category] + "' is not among the category labels.");
                      }
                      count[index]++;
                  }
              }

              double k = 0.0;
              for (int i = 0; i < count.Length; i++)
              {
                  double frequency = count[i] / (double)tuple.Item2;
                  k += -frequency * Log2(frequency);
              }

              double freq = tuple.Item2 / sum;
              Entropy += freq * k;
          }

          return Entropy;
      }

      /// <summary>
      /// Split information of attribute <paramref name="attrCol"/>: the entropy of the
      /// distribution of its values over the given rows (the gain-ratio denominator).
      /// </summary>
      /// <param name="attrCol">Column index of the attribute.</param>
      /// <param name="pnRows">Row indices to consider.</param>
      public double AttributeInfoRate(int attrCol, int[] pnRows)
      {
          var tuples = AttributeCount(attrCol, pnRows);
          var sum = (double)pnRows.Length;
          double SplitE = 0.0;

          foreach (var tuple in tuples)
          {
              double frequency = tuple.Item2 / sum;
              SplitE += -frequency * Log2(frequency);
          }

          return SplitE;
      }

      /// <summary>Entropy of the class variable over the given rows.</summary>
      /// <param name="pnRows">Row indices to consider.</param>
      public double CategoryInfo(int[] pnRows)
      {
          var tuples = AttributeCount(Category, pnRows);
          var sum = (double)pnRows.Length;
          double Entropy = 0.0;

          foreach (var tuple in tuples)
          {
              double frequency = tuple.Item2 / sum;
              Entropy += -frequency * Log2(frequency);
          }

          return Entropy;
      }

      // Lazily yields the values of one column for the given rows.
      private static IEnumerable<T> GetAttribute(T[,] data, int col, int[] pnRows)
      {
          foreach (var irow in pnRows)
          {
              yield return data[irow, col];
          }
      }

      // Base-2 logarithm with Log2(0) defined as 0, so that 0 * log(0) terms vanish.
      private static double Log2(double x)
      {
          return x == 0.0 ? 0.0 : Math.Log(x, 2.0);
      }

      /// <summary>
      /// ID3 criterion: returns the attribute column with the largest information gain.
      /// </summary>
      /// <param name="pnRows">Row indices to consider.</param>
      /// <param name="pnCols">Candidate column indices; the class column is ignored.</param>
      /// <returns>The best column, or 0 when there is no candidate attribute.</returns>
      public int MaxEntropy(int[] pnRows, int[] pnCols)
      {
          double cateEntropy = CategoryInfo(pnRows);
          int maxAttr = 0;
          double max = double.MinValue;

          foreach (var icol in pnCols)
          {
              if (icol == Category)
              {
                  continue;
              }

              double Gain = cateEntropy - AttributeInfo(icol, pnRows);
              if (max < Gain)
              {
                  max = Gain;
                  maxAttr = icol;
              }
          }

          return maxAttr;
      }

      /// <summary>
      /// C4.5 criterion: returns the attribute column with the largest gain ratio
      /// (information gain divided by split information).
      /// </summary>
      /// <param name="pnRows">Row indices to consider.</param>
      /// <param name="pnCols">Candidate column indices; the class column is ignored.</param>
      /// <returns>
      /// The best column. When no candidate has a defined ratio, the first candidate
      /// attribute is returned; when there is no candidate at all, 0.
      /// </returns>
      public int MaxEntropyRate(int[] pnRows, int[] pnCols)
      {
          double cateEntropy = CategoryInfo(pnRows);
          int maxAttr = 0;
          bool haveFallback = false;
          double max = double.MinValue;

          foreach (var icol in pnCols)
          {
              if (icol == Category)
              {
                  continue;
              }

              if (!haveFallback)
              {
                  // Guarantees that the result is one of the candidate columns.
                  maxAttr = icol;
                  haveFallback = true;
              }

              double SplitE = AttributeInfoRate(icol, pnRows);
              if (SplitE <= 0.0)
              {
                  // Single-valued attribute: the ratio would be x/0 (NaN or an
                  // infinity caused by rounding noise) and it cannot split the rows.
                  continue;
              }

              double Gain = cateEntropy - AttributeInfo(icol, pnRows);
              double GrainRation = Gain / SplitE;
              if (max < GrainRation)
              {
                  max = GrainRation;
                  maxAttr = icol;
              }
          }

          return maxAttr;
      }

      /// <summary>
      /// Counts how often each distinct value occurs in a column over the given rows.
      /// </summary>
      /// <param name="col">Column index.</param>
      /// <param name="pnRows">Row indices to consider.</param>
      /// <returns>One (value, count) tuple per distinct value; evaluated lazily.</returns>
      public IEnumerable<Tuple<T, int>> AttributeCount(int col, int[] pnRows)
      {
          return GetAttribute(Data, col, pnRows)
              .GroupBy(n => n)
              .Select(g => Tuple.Create(g.First(), g.Count()));
      }
  }

  /// <summary>
  /// One node of the decision tree: the column it refers to, the value of that
  /// column on the branch leading here, and the child nodes.
  /// </summary>
  /// <typeparam name="T">Type of the cell values.</typeparam>
  public sealed class DecisionTreeNode<T>
  {
      /// <summary>
      /// Column index this node refers to: an attribute column for inner nodes,
      /// the class column for leaves, -1 for the artificial root.
      /// </summary>
      public int Label { get; set; }

      /// <summary>Value of the column <see cref="Label"/> on this branch.</summary>
      public T Value { get; set; }

      /// <summary>Child nodes; empty for leaves.</summary>
      public List<DecisionTreeNode<T>> Children { get; set; }

      /// <summary>Creates a node without children.</summary>
      /// <param name="label">Column index the node refers to.</param>
      /// <param name="value">Value of that column on this branch.</param>
      public DecisionTreeNode(int label, T value)
      {
          Label = label;
          Value = value;
          Children = new List<DecisionTreeNode<T>>();
      }
  }

这个类里面包含着两个算法，C4.5和ID3，C4.5是在ID3的基础上进行改进的一种算法。我采取了C4.5的算法，即 Learn 方法中调用 MaxEntropyRate 的那一行（原始代码的第94行；它上面被注释掉的 MaxEntropy 调用对应ID3）。C4.5 算法，是用信息增益率来选择属性。ID3选择属性用的是信息增益，也就是划分前后熵（entropy，熵是一种不纯度度量准则）的变化值；而C4.5用的是信息增益率，即信息增益除以该属性的分裂信息。此处信息量比较大，可以参考 http://shiyanjun.cn/archives/428.html 这篇文章。

决策树建好后，我们开始调用：

            var tree = BuildTree();
            // Textual dump of the learned tree (Learn() fills tree.sb).
            string treeText = tree.sb.ToString();
            // Predict with the tree. The values must use the same buckets BuildTree emits ("high"/"middle"/"low"), in column order Index, Space, XSize, YSize, Height; the last entry is the class placeholder.
            var test = new string[] { "high", "low", "high", "low", "low", "" };
            bool isTitle = tree.Search(test);

第三行，是把树型结构输出来，最后两行是判断一个块信息是否是标题。这个数组当然也是数值转换为离散值后的结果。

有一点必须得明确，就是决策树得剪裁，否则树可能长得过深、过大，占用大量内存。在决策树类的 Learn 方法中（原始代码的第78行），如果树的层次结构超过了10层，就停止生长了。其实在规则过滤和决策树预测之间，我选择了规则过滤，因为用决策树的结果，经测试，准确率并不高；有可能是我才开始用，没有把握精髓，所以我保守选择。

c#抽取pdf文档标题（4）——机器学习以及决策树的更多相关文章

c#抽取pdf文档标题——前言
由于工作的需要,研究c#抽取pdf文档标题有3个月了.这项工作是一项"伟大而艰巨"的任务.应该是我目前研究工作中最长的一次.我觉得在长时间忙碌后,应该找些时间,把自己的心路历程归纳 ...
c#抽取pdf文档标题（3）
上一篇介绍了整体流程以及利用库读取pdf内容形成字符集合.这篇着重介绍下,过滤规则,毕竟我们是使用规则过滤,最后得到标题的. 首先看归一化处理,什么是归一化呢?就是使结果始终处于0-1之间(包括0,1 ...
c#抽取pdf文档标题（1）
首先看看我的项目结构: 从上面的结果图中,我们可以看出,主要用了两个库:itextsharp.dll 和 pdfbox-1.8.9.dll,dll文件夹存放引用的库,handles文件夹存放抽取的处理 ...
c#抽取pdf文档标题（2）
public class IETitle { public static List<WordInfo> WordsInfo = new List<WordInfo>(); pr ...
Python处理Excel和PDF文档
一.使用Python操作Excel Python来操作Excel文档以及如何利用Python语言的函数和表达式操纵Excel文档中的数据. 虽然微软公司本身提供了一些函数,我们可以使用这些函数操作Ex ...
C#给PDF文档添加文本和图片页眉
页眉常用于显示文档的附加信息,我们可以在页眉中插入文本或者图形,例如,页码.日期.公司徽标.文档标题.文件名或作者名等等.那么我们如何以编程的方式添加页眉呢?今天,这篇文章向大家分享如何使用了免费组件 ...
将w3cplus网站中的文章页面提取并导出为pdf文档
最近在看一些关于CSS3方面的知识,主要是平时看到网页中有很多用CSS3实现的很炫的效果,所以就打算系统的学习一下.在网上找到很多的文章,但都没有一个好的整理性,比较凌乱.昨天看到w3cplus网站中 ...
PDF2SWF转换只有一页的PDF文档，在FlexPaper不显示解决方法
问题:PDF2SWF转换只有一页的PDF文档,在FlexPaper不显示! FlexPaper 与 PDF2SWF 结合是解决在线阅读PDF格式文件的问题的,多页的PDF文件转换可以正常显示,只有一页 ...
【PDF】java使用Itext生成pdf文档--详解
[API接口] 一.Itext简介 API地址:javadoc/index.html:如 D:/MyJAR/原JAR包/PDF/itext-5.5.3/itextpdf-5.5.3-javadoc/ ...

随机推荐

PHP合并2个数字键数组的值
先要了解一个基础知识点:PHP数组合并+与array_merge的区别分析 & 对多个数组合并去重技巧 <?php /** * PHP合并2个数字键数组的值 * * @param arr ...
Linear Algebra lecture1 note
Professor: Gilbert Strang Text: Introduction to Linear Algebra http://web.mit.edu/18.06 Lecture 1 ...
587A
#include<iostream> #include<algorithm> #include<stdio.h> #include<stdlib.h> ...
尝试制作了一个Panorama
照片拍的不太好, 效果如下(浏览器需要支持WebGL), 您可以用鼠标拖动浏览: //
【暑假】[实用数据结构]UVAlive 3027 Corporative Network
UVAlive 3027 Corporative Network 题目: Corporative Network Time Limit: 3000MS Memory Limit: 30000K ...
JS浮点类型计算
/* ---------------- JS浮点数运算重置 ---------------- */ //加法函数 //调用:accAdd(arg1,arg2) //返回值:arg1加上arg2的精确结 ...
ASPNET5的依赖注入
ASP.NET5设计的时候就是以DI为基础的,它可以利用内建的框架在Startup类的方法中,把依赖注入进去.应用服务也可以被配置的注入.默认的服务容器提供一些基本的功能,它并不打算代替现代主流的DI ...
Heapsort 堆排序算法详解（Java实现)
Heapsort (堆排序)是最经典的排序算法之一,在google或者百度中搜一下可以搜到很多非常详细的解析.同样好的排序算法还有quicksort(快速排序)和merge sort(归并排序),选择 ...
winform socket编程之TCPListener
运行结果: 服务端代码 using System; using System.Collections.Generic; using System.ComponentModel; using Syste ...
Python——三级菜单
#三级菜单函数 menu = { '北京':{ 海淀:{ '五道口':{} '中关村':{} '上帝':{} } '昌平':{} '朝阳':{} '东城':{} }, '上海':{} '山东':{} ...