如何修复 x-ray(Node.js 抓取库)响应中的编码问题?

时间:2021-04-20 20:34:46

The following script works perfectly on my Node.js server, but occasionally it returns a garbled response like the one below when I scrape Cyrillic websites.

以下脚本在我的 Node.js 服务器上运行得很好,但是当我抓取一些西里尔文网站时,偶尔会返回像下面这样的乱码响应。

Script

// Scrape `url` with x-ray: each key maps to a selector, and the
// `selector@attr` form reads an attribute instead of the element text.
x(url, {
    name: 'title',
    ogDescription: 'meta[property="og:description"]@content',
    metaDescription: 'meta[name="description"]@content',
        ogImage: 'meta[property="og:image"]@content',
        // NOTE(review): this selector looks malformed (`name="name="..."`);
        // presumably `meta[name="twitter:image:src"]@content` was intended.
        twitterImage: 'meta[name="name="twitter:image:src""]@content',
    metaImage: 'meta[name="image"]@content',
    headImage: 'head img@src',
    contentImage_1: '.content img@src',
    contentImage_2: '.image img@src'
  })
// The scraper returns a function that is invoked with a node-style callback.
// NOTE(review): `err` is never checked, so `obj` is dereferenced even on failure.
(function (err, obj) {
    // Group the scraped fields into candidate lists per kind of data.
    var firstData = {
        name: [
            obj.name
        ],
        description: [
            obj.metaDescription, 
            obj.ogDescription,
        ],
        image: [
            obj.ogImage,
            obj.twitterImage,
            obj.metaImage,
            obj.headImage,
            obj.contentImage_1,
            obj.contentImage_2
        ]
    }
    // NOTE(review): the snippet ends here; the callback body and the call are
    // never closed in the original post.

Example of response with incorrect encoding

编码错误的响应示例

firstData { name: [ '(Rock, Pop) [15LP] [24/96] Queen - Studio Collection - 2015, 
                     FLAC (tracks) :: RuTracker.org' ],
  description:
   [ 'RuTracker.org » ���������� ��� (����������� ���������) » 
                      ������� ������� (Rock, Pop) [15LP] [24/96] Queen - 
                      Studio Collection - 2015, FLAC (tracks)',
                      undefined ],
  image: [ undefined, undefined, undefined, undefined, undefined, undefined ] }

How do I fix this?

我该如何解决?

1 个解决方案

#1


0  

You can use `request` as x-ray's driver and convert the response body with iconv inside it, like this:

你可以使用 request 作为 x-ray 的驱动程序,并在其中用 iconv 转换响应体,如下所示:

// Options for request.defaults(): the 'binary' encoding maps every response
// byte to exactly one character, so the raw bytes can be recovered losslessly
// before they are re-decoded with the correct charset.
var options = { encoding: 'binary' };

// Converter from Windows-1251 (a common Cyrillic code page) to UTF-8.
// `new` is applied to the Iconv constructor itself; the previous
// `new require('iconv').Iconv(...)` applied `new` to `require` instead, and
// the result was assigned to an undeclared (implicitly global) variable.
var Iconv = require('iconv').Iconv;
var iconv = new Iconv('Windows-1251', 'utf8');

/**
 * Re-decode a response body that was read with the 'binary' encoding.
 *
 * @param {string} body - Response body as a binary (one char per byte) string.
 * @returns {string} The body decoded from Windows-1251 as a UTF-8 string;
 *     falsy input (empty string, undefined, null) is returned unchanged.
 * @throws {Error} Whatever `iconv.convert` throws for input it cannot convert.
 */
var conv = function (body) {
    if (!body) return body;
    // Buffer.from is a factory function, not a constructor: no `new`.
    var rawBytes = Buffer.from(body, 'binary');
    return iconv.convert(rawBytes).toString();
};

var request = require('request').defaults(options);

/**
 * x-ray driver that fetches a page with `request` and, when a `conv`
 * function is configured, passes the body through it before handing the
 * result to x-ray.
 *
 * NOTE(review): the conversion is applied to every response, so pages that
 * are already UTF-8 would be re-decoded as well; confirm that all target
 * sites use the configured source charset.
 *
 * @param {Object} context - x-ray request context; only `context.url` is read.
 * @param {Function} callback - Node-style callback `(err, body)`.
 */
var driver = function driver(context, callback) {
    request(context.url, function (err, response, body) {
        if (err || !conv) return callback(err, body);

        var decoded;
        try {
            decoded = conv(body);
        } catch (convErr) {
            // A conversion failure inside this async callback would otherwise
            // surface as an uncaught exception; report it through the callback.
            return callback(convErr);
        }
        // Called outside the try block so an exception thrown by the callback
        // itself is not mistaken for a conversion error.
        return callback(null, decoded);
    });
};
x.driver(driver);


// Scrape `url` with x-ray: each key maps to a selector, and the
// `selector@attr` form reads an attribute instead of the element text.
x(url, {
    name: 'title',
    ogDescription: 'meta[property="og:description"]@content',
    metaDescription: 'meta[name="description"]@content',
    ogImage: 'meta[property="og:image"]@content',
    // Fixed: the selector was malformed (`meta[name="name="twitter:image:src""]`).
    twitterImage: 'meta[name="twitter:image:src"]@content',
    metaImage: 'meta[name="image"]@content',
    headImage: 'head img@src',
    contentImage_1: '.content img@src',
    contentImage_2: '.image img@src'
})(function (err, obj) {
    // On failure `obj` is not usable; report the error instead of
    // dereferencing it.
    if (err) {
        console.error(err);
        return;
    }

    // Group the scraped fields into candidate lists per kind of data.
    var firstData = {
        name: [
            obj.name
        ],
        description: [
            obj.metaDescription,
            obj.ogDescription
        ],
        image: [
            obj.ogImage,
            obj.twitterImage,
            obj.metaImage,
            obj.headImage,
            obj.contentImage_1,
            obj.contentImage_2
        ]
    };
    console.log(firstData);
});

#1


0  

You can use `request` as x-ray's driver and convert the response body with iconv inside it, like this:

你可以使用 request 作为 x-ray 的驱动程序,并在其中用 iconv 转换响应体,如下所示:

// Options for request.defaults(): the 'binary' encoding maps every response
// byte to exactly one character, so the raw bytes can be recovered losslessly
// before they are re-decoded with the correct charset.
var options = { encoding: 'binary' };

// Converter from Windows-1251 (a common Cyrillic code page) to UTF-8.
// `new` is applied to the Iconv constructor itself; the previous
// `new require('iconv').Iconv(...)` applied `new` to `require` instead, and
// the result was assigned to an undeclared (implicitly global) variable.
var Iconv = require('iconv').Iconv;
var iconv = new Iconv('Windows-1251', 'utf8');

/**
 * Re-decode a response body that was read with the 'binary' encoding.
 *
 * @param {string} body - Response body as a binary (one char per byte) string.
 * @returns {string} The body decoded from Windows-1251 as a UTF-8 string;
 *     falsy input (empty string, undefined, null) is returned unchanged.
 * @throws {Error} Whatever `iconv.convert` throws for input it cannot convert.
 */
var conv = function (body) {
    if (!body) return body;
    // Buffer.from is a factory function, not a constructor: no `new`.
    var rawBytes = Buffer.from(body, 'binary');
    return iconv.convert(rawBytes).toString();
};

var request = require('request').defaults(options);

/**
 * x-ray driver that fetches a page with `request` and, when a `conv`
 * function is configured, passes the body through it before handing the
 * result to x-ray.
 *
 * NOTE(review): the conversion is applied to every response, so pages that
 * are already UTF-8 would be re-decoded as well; confirm that all target
 * sites use the configured source charset.
 *
 * @param {Object} context - x-ray request context; only `context.url` is read.
 * @param {Function} callback - Node-style callback `(err, body)`.
 */
var driver = function driver(context, callback) {
    request(context.url, function (err, response, body) {
        if (err || !conv) return callback(err, body);

        var decoded;
        try {
            decoded = conv(body);
        } catch (convErr) {
            // A conversion failure inside this async callback would otherwise
            // surface as an uncaught exception; report it through the callback.
            return callback(convErr);
        }
        // Called outside the try block so an exception thrown by the callback
        // itself is not mistaken for a conversion error.
        return callback(null, decoded);
    });
};
x.driver(driver);


// Scrape `url` with x-ray: each key maps to a selector, and the
// `selector@attr` form reads an attribute instead of the element text.
x(url, {
    name: 'title',
    ogDescription: 'meta[property="og:description"]@content',
    metaDescription: 'meta[name="description"]@content',
    ogImage: 'meta[property="og:image"]@content',
    // Fixed: the selector was malformed (`meta[name="name="twitter:image:src""]`).
    twitterImage: 'meta[name="twitter:image:src"]@content',
    metaImage: 'meta[name="image"]@content',
    headImage: 'head img@src',
    contentImage_1: '.content img@src',
    contentImage_2: '.image img@src'
})(function (err, obj) {
    // On failure `obj` is not usable; report the error instead of
    // dereferencing it.
    if (err) {
        console.error(err);
        return;
    }

    // Group the scraped fields into candidate lists per kind of data.
    var firstData = {
        name: [
            obj.name
        ],
        description: [
            obj.metaDescription,
            obj.ogDescription
        ],
        image: [
            obj.ogImage,
            obj.twitterImage,
            obj.metaImage,
            obj.headImage,
            obj.contentImage_1,
            obj.contentImage_2
        ]
    };
    console.log(firstData);
});