Selenium&EmguCV实现爬虫图片识别

概述

爬虫需要抓取网站上的价格。与一般的网页抓取不同的是，要抓取的内容是通过AJAX加载的，并且价格不是文本，而是通过CSS背景图片显示的。

每一个数字对应一个样式，如'p_h57_5'

/* Example of one generated digit class. The digit is not text: it is shown by
   positioning a shared background image at a horizontal offset of -590px.
   NOTE(review): presumably the image is a sprite strip holding all digits and
   the offset selects one of them - confirm against the live page. */
.p_h57_5 {

background: url('http://pic.c-ctrip.com/priceblur/h57/3713de5c594648529f39d031243966dd.gif') no-repeat -590px;

padding: 0 6px;

font-size: 18px;

}

每个数字对应的样式名和背景图片（background-image）都是动态变化的，因此无法通过样式名直接映射出数字，只能通过图片识别来获取每一个房型的房价。虽然后来有了其它渠道获取房价，这里还是记录一下用Selenium&Emgu抓取的方式。

流程：

1.Selenium访问网址

2.全屏截图

3.Selenium选择器获取房型等信息

4.Selenium选择器获取价格DOM元素，计算出价格元素的相对位置，截取价格图片，使用Emgu识别价格并且输出

实现

/// <summary>
/// Opens the hotel page in Chrome, takes a stitched full-page screenshot,
/// then for every room row crops the price element out of that screenshot,
/// runs OCR on it and prints room type, rate type and price to the console.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
        {
            ChromeOptions options = new ChromeOptions();
            // Every switch must be its own argument; a single space-joined string
            // is handed to Chrome as one (unrecognised) switch.
            options.AddArguments("--start-maximized", "--disable-popup-blocking");

            var driver = new ChromeDriver(options);
            try
            {
                driver.Navigate().GoToUrl("http://hotels.ctrip.com/hotel/992765.html");

                // The room table is filled in by AJAX; its presence marks the page as loaded.
                // Until() returns as soon as the element exists, so the longer timeout only
                // matters on slow loads.
                new WebDriverWait(driver, TimeSpan.FromSeconds(10)).Until(
                    ExpectedConditions.ElementExists(By.ClassName("htl_room_table")));

                ReadOnlyCollection<IWebElement> roomRows = driver.FindElementsByCssSelector("tr[expand]");

                // Hide the currency symbol (<dfn>) so that only digits remain in the price image.
                driver.ExecuteScript(@"
                var arr =  document.getElementsByTagName('dfn');
                for(var i=0;i<arr.length;i++){
                    arr[i].style.display = 'none';
                }
            ");

                // Full-page screenshot; the helper returns null when it fails completely.
                var pageShot = GetEntereScreenshot(driver);
                if (pageShot == null)
                {
                    Console.Error.WriteLine("Full-page screenshot failed; no prices can be read.");
                    return;
                }

                using (pageShot)
                {
                    // Save(string) alone would write PNG data into the .jpg file for an
                    // in-memory bitmap, so the encoder is named explicitly.
                    pageShot.Save(@"Z:\111.jpg", System.Drawing.Imaging.ImageFormat.Jpeg);

                    Console.WriteLine("{0,-20}{1,-20}{2,-20}", "房型", "类型", "房价");

                    foreach (IWebElement row in roomRows)
                    {
                        var roomType = "";
                        try
                        {
                            roomType = row.FindElement(By.CssSelector(".room_unfold")).Text;
                        }
                        catch (WebDriverException)
                        {
                            // Best effort: a row without a readable .room_unfold cell
                            // is printed with an empty room type.
                        }

                        var roomTypeText = regRoomType.Match(roomType);
                        var roomTypeName = row.FindElement(By.CssSelector("span.room_type_name")).Text;

                        // Crop the price element out of the full-page screenshot and OCR it.
                        using (var priceImage = row.FindElement(By.CssSelector("span.base_price")).SnapshotV2(pageShot))
                        {
                            var price = ORC_((Bitmap)priceImage);
                            Console.WriteLine("{0,-20}{1,-20}{2,-20}", roomTypeText.Value, roomTypeName, price);
                        }
                    }
                }

                // Keep the console (and the browser) open until a key is pressed.
                Console.Read();
            }
            finally
            {
                // Always shut down the browser and the chromedriver process.
                driver.Quit();
            }
        }

图片识别方法



/// <summary>
/// Restricts the shared OCR engine to the characters 0-9, since only numeric
/// prices are recognised. Static field initializers run before this body,
/// so <c>_ocr</c> is already constructed here.
/// </summary>
static Program()
        {
            _ocr.SetVariable("tessedit_char_whitelist", "0123456789");
        }

        /// <summary>
        /// OCR engine shared by all calls to <c>ORC_</c>. It loads the "eng" data
        /// from a hard-coded Emgu CV installation directory; adjust the path to
        /// the local tessdata folder.
        /// </summary>
        private static readonly Tesseract _ocr = new Tesseract(@"C:\Emgu\emgucv-windows-universal-cuda 2.9.0.1922\bin\tessdata", "eng", Tesseract.OcrEngineMode.OEM_TESSERACT_CUBE_COMBINED);

        /// <summary>
        /// Runs OCR on the given bitmap with the shared engine and returns the
        /// recognised text.
        /// </summary>
        /// <param name="img">Image to recognise; may be null.</param>
        /// <returns>
        /// The text reported by the engine, or an empty string when
        /// <paramref name="img"/> is null or recognition throws.
        /// </returns>
        public static string ORC_(Bitmap img)
        {
            // "" signals that OCR could not be performed.
            if (img == null)
            {
                return "";
            }

            try
            {
                // Both Emgu images wrap native memory and must be disposed.
                using (Image<Bgr, byte> image = new Image<Bgr, byte>(img))
                using (Image<Gray, byte> gray = image.Convert<Gray, byte>())
                {
                    _ocr.Recognize(gray);
                    return _ocr.GetText();
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failure but keep the "" contract for callers.
                Console.Error.WriteLine("OCR failed: " + ex.Message);
                return "";
            }
        }

Selenium内置的截图方法只能截取浏览器当前可视区域内的内容，因此这里采用了一种整页截图的方式：内置截图 + 控制滚动条 + 图片拼接。



 /// <summary>
 /// Builds a screenshot of the whole page by scrolling the window tile by tile,
 /// taking a viewport screenshot for each tile and drawing it into one bitmap.
 /// </summary>
 /// <param name="_driver">
 /// Driver for the loaded page; it is cast to <c>IJavaScriptExecutor</c> and
 /// <c>ITakesScreenshot</c>.
 /// </param>
 /// <returns>
 /// The stitched bitmap. Returns null when the page or viewport size cannot be
 /// determined or when a failure occurs before the bitmap is allocated; when a
 /// failure occurs later, the partially drawn bitmap is returned. The caller
 /// owns and must dispose the result.
 /// </returns>
 /// <remarks>
 /// NOTE(review): tile coordinates come from CSS pixel sizes reported by the page;
 /// this presumably assumes screenshots are captured at a device pixel ratio of 1 -
 /// confirm on high-DPI displays.
 /// </remarks>
 public static Bitmap GetEntereScreenshot(IWebDriver _driver)
        {
            Bitmap stitchedImage = null;
            try
            {
                var js = (IJavaScriptExecutor)_driver;
                var camera = (ITakesScreenshot)_driver;

                // Convert tolerates either integral or floating-point script results.
                Func<string, int> evalInt = script => Convert.ToInt32(
                    js.ExecuteScript(script), System.Globalization.CultureInfo.InvariantCulture);

                // Size of the whole page.
                int totalWidth = evalInt("return document.body.offsetWidth");
                int totalHeight = evalInt("return  document.body.parentNode.scrollHeight");

                // Size of the viewport.
                int viewportWidth = evalInt("return document.body.clientWidth");
                int viewportHeight = evalInt("return window.innerHeight");

                // A zero step would make the tiling loops below never terminate,
                // and a zero-sized bitmap cannot be created.
                if (totalWidth <= 0 || totalHeight <= 0 || viewportWidth <= 0 || viewportHeight <= 0)
                {
                    Console.Error.WriteLine("Full-page screenshot aborted: page or viewport size is not positive.");
                    return null;
                }

                // Split the page into viewport-sized tiles, row by row; the last tile
                // of a row or column is shrunk to what is left of the page.
                List<Rectangle> rectangles = new List<Rectangle>();
                for (int top = 0; top < totalHeight; top += viewportHeight)
                {
                    int tileHeight = Math.Min(viewportHeight, totalHeight - top);
                    for (int left = 0; left < totalWidth; left += viewportWidth)
                    {
                        int tileWidth = Math.Min(viewportWidth, totalWidth - left);
                        rectangles.Add(new Rectangle(left, top, tileWidth, tileHeight));
                    }
                }

                // The first tile is captured without scrolling, so start from the origin.
                js.ExecuteScript("window.scrollTo(0, 0)");

                stitchedImage = new Bitmap(totalWidth, totalHeight);
                using (Graphics g = Graphics.FromImage(stitchedImage))
                {
                    bool isFirstTile = true;
                    Rectangle previous = Rectangle.Empty;
                    foreach (Rectangle rectangle in rectangles)
                    {
                        if (!isFirstTile)
                        {
                            // Scroll by the distance between the bottom-right corners of
                            // consecutive tiles so the new tile ends at the viewport's
                            // bottom-right corner.
                            int xDiff = rectangle.Right - previous.Right;
                            int yDiff = rectangle.Bottom - previous.Bottom;
                            js.ExecuteScript(String.Format(
                                System.Globalization.CultureInfo.InvariantCulture,
                                "window.scrollBy({0}, {1})", xDiff, yDiff));
                            // Give the page time to repaint after scrolling.
                            System.Threading.Thread.Sleep(200);
                        }

                        var screenshot = camera.GetScreenshot();

                        // Image.FromStream requires the stream to stay open for the lifetime
                        // of the image, so both are disposed only after drawing.
                        using (MemoryStream memStream = new MemoryStream(screenshot.AsByteArray))
                        using (Image screenshotImage = Image.FromStream(memStream))
                        {
                            // The tile occupies the bottom-right part of the viewport screenshot.
                            Rectangle sourceRectangle = new Rectangle(
                                viewportWidth - rectangle.Width,
                                viewportHeight - rectangle.Height,
                                rectangle.Width,
                                rectangle.Height);
                            g.DrawImage(screenshotImage, rectangle, sourceRectangle, GraphicsUnit.Pixel);
                        }

                        previous = rectangle;
                        isFirstTile = false;
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failure and return whatever has been stitched so far.
                Console.Error.WriteLine("Full-page screenshot failed: " + ex.Message);
            }
            return stitchedImage;
        }

最后是根据传入的元素和全屏截图，按元素的位置和大小裁剪出价格元素的图片。



 /// <summary>
 /// Crops the area occupied by <paramref name="element"/> out of a full-page
 /// screenshot, using the element's reported location and size.
 /// </summary>
 /// <param name="element">Element whose location and size define the crop area.</param>
 /// <param name="bitmap">Full-page screenshot to crop from.</param>
 /// <returns>
 /// A new image holding the part of the element that lies inside the screenshot,
 /// or null when the element has no area inside it. The caller owns the result.
 /// </returns>
 /// <exception cref="ArgumentNullException">
 /// <paramref name="element"/> or <paramref name="bitmap"/> is null.
 /// </exception>
 public static Image SnapshotV2(this IWebElement element, Bitmap bitmap)
        {
            if (element == null)
            {
                throw new ArgumentNullException("element");
            }
            if (bitmap == null)
            {
                throw new ArgumentNullException("bitmap");
            }

            // Clamp the element's rectangle to the screenshot: Bitmap.Clone throws when
            // any part of the requested rectangle lies outside the source bitmap.
            Rectangle crop = Rectangle.Intersect(
                new Rectangle(element.Location, element.Size),
                new Rectangle(0, 0, bitmap.Width, bitmap.Height));

            if (crop.Width <= 0 || crop.Height <= 0)
            {
                return null;
            }

            return bitmap.Clone(crop, bitmap.PixelFormat);
        }

运行效果如下

Selenium&EmguCV实现爬虫图片识别

秒客网

Selenium&EmguCV实现爬虫图片识别

概述

实现

相关文章