// Source: data-extractor.js

"use strict";




var cheerio = require('cheerio');
var _ = require('underscore');
var fn = require('./fn');

/**
 * Compile one schema fragment into the normalized rule tree consumed by
 * `getItem`.
 *
 * Every returned rule carries a `$type` tag and, depending on the input:
 *  - String: `{ $type: 'function', $rule: <the string>, $fn: fn.text }`.
 *  - Array: `{ $type: 'Array', $rule: <item selector>, $fn: <item rule> }`.
 *    Only the first element is used as the item schema. Its selector is
 *    hoisted onto the array rule and removed from the item rule.
 *  - Object with keys other than `$rule`/`$fn`/`$type`:
 *    `{ $type: 'Object', $rule, $fn: { key: <sub rule>, ... } }`.
 *  - Object with only `$rule`/`$fn`/`$type`:
 *    `{ $type: 'function', $rule, $fn: val.$fn }`.
 *  - Anything else: `{ $type: <constructor name> }` with no `$fn`.
 *
 * @param {String|Array|Object} val schema fragment.
 * @return {Object} normalized rule.
 * @throws {TypeError} if `val` is null/undefined, or is an array whose first
 *   element is null/undefined.
 */
var parseRule = function( val ){
    if( val === null || val === undefined ){
        throw new TypeError('parseRule: schema value must not be null or undefined');
    }
    var rule = {};
    // Objects without a prototype have no `constructor`; treat them as plain objects.
    rule.$type = val.constructor ? val.constructor.name : 'Object';
    switch( rule.$type ){
        case 'String':
            rule.$fn = fn.text;
            rule.$rule = val;
            rule.$type = 'function';
            break;
        case 'Array':
            var itemSchema = val[0];
            if( itemSchema === null || itemSchema === undefined ){
                throw new TypeError('parseRule: array schema needs an item schema as its first element');
            }
            // A bare string item IS the selector; reading `.$rule` off a string
            // would yield undefined and the selector would be lost.
            rule.$rule = typeof itemSchema === 'string' ? itemSchema : ( itemSchema.$rule || '' );
            rule.$fn = parseRule( itemSchema );
            // The selector now lives on the array rule, so the item rule must
            // not apply it a second time.
            delete rule.$fn.$rule;
            break;
        case 'Object':
            var subRules = _.omit( val, '$rule', '$fn', '$type');
            var subKeys = Object.keys( subRules );
            rule.$rule = val.$rule || '';
            if( subKeys.length ){
                rule.$fn = {};
                subKeys.forEach(function( key ){
                    rule.$fn[key] = parseRule( subRules[key] );
                });
            } else {
                // Only reserved keys present: a leaf with a custom extractor.
                rule.$type = 'function';
                rule.$fn = val.$fn;
            }
            break;
    }
    return rule;
};

exports.parseRule = parseRule;


/**
 * Apply a normalized rule (as produced by `parseRule`) to a piece of markup
 * and return the extracted data.
 *
 * @param {String|Object} $ an HTML string, or a cheerio selection to search in.
 * @param {Object} rule normalized rule with `$type`, optional `$rule`
 *   (CSS selector) and `$fn`.
 * @return {*} depends on `rule.$type`:
 *   - 'function': result of `rule.$fn(selection)`, or '' when nothing matched.
 *   - 'Object': object with one entry per key of `rule.$fn`, or `{}` when
 *     nothing matched.
 *   - 'Array': one entry per matched element, or `[]` when nothing matched.
 *   - any other type: `undefined`.
 */
var getItem = function( $, rule ){
    // `cheerio.load()` returns a selector function, not a selection; `.root()`
    // gives the selection that supports `.find()` and `.length`.
    var root = typeof $ === 'string' ? cheerio.load($).root() : $;
    var out;
    // An empty/missing selector means "operate on the current selection".
    var elem = rule.$rule? root.find(rule.$rule) : root;
    switch ( rule.$type ){
        case 'function':
            out = elem.length? rule.$fn( elem ) : '';
            break;
        case 'Object':
            out = {};
            if(elem.length) {
                Object.keys(rule.$fn).forEach(function( key ){
                    out[key] = getItem( elem, rule.$fn[key] );
                });
            }
            break;
        case 'Array':
            out = [];
            // `eq(i)` narrows the existing selection to one element, avoiding
            // the top-level `cheerio(node)` call that newer cheerio releases
            // no longer provide.
            for( var i = 0; i < elem.length; i++ ){
                out.push( getItem( elem.eq(i), rule.$fn ) );
            }
            break;
    }
    return out;
};
exports.getItem = getItem;


/**
 * Extractor
 * @class
 * Pulls data out of HTML pages according to a schema.
 * A schema describes:
 * * the names of the fields to fetch,
 * * the CSS rule (selector) for each field,
 * * the data extractor function for each field.
 *
 * The schema is compiled once with `parseRule` and the result is kept on
 * `this.rules`.
 *
 * @param {Object} schema schema definition; schemas may be nested at any level.
 */
function Extractor( schema ){
    var compiledRules = parseRule( schema );
    this.rules = compiledRules;
}

/**
 * Extract data from an HTML string/page using the rules compiled in the
 * constructor.
 *
 * The markup is parsed with `cheerio.load()` and searched from the document
 * root, so selectors can also match elements that sit at the top level of the
 * supplied markup.
 *
 * @param {String} html HTML page.
 * @return {Object} extracted data as per schema definition.
 */
Extractor.prototype.extract = function(html){
    // `cheerio(html)` would yield the top-level nodes themselves, whose
    // `.find()` only sees descendants; the root selection sees everything.
    var root = cheerio.load(html).root();
    return getItem( root, this.rules );
};


exports.Extractor = Extractor;