0

I' trying to scrape a html page and make it into a json object

this is the page the page

<html><head><title>Index</title><meta charset="UTF-8"></head><body><div><p>[ <a href="index.html">Index</a> ] | [ <a href="config.html">Device Config</a> ]</p></div><div>Neighbors<pre>fe80::212:4b00:8b8:6ecb REACHABLE</pre></div><div>Default Route<pre>fe80::212:4b00:8b8:6ecb</pre></div><div>Routes<pre></pre></div><div>Sensors<pre>Battery Temp = 19 C
Battery Volt = 3320 mV
Air Pressure = 1031.12 hPa
Air Temp = 22.66 C
Object Temp = 12.375 C
Ambient Temp = 23.062 C
Light = 0.00 lux
HDC Humidity = 43.93 %RH
HDC Temp = 23.03 C
Acc X = 0.02 G
Acc Y = 0.02 G
Acc Z = -1.10 G
Gyro X = -2.93 deg per sec
Gyro Y = -2.74 deg per sec
Gyro Z = 5.18 deg per sec</pre></div><div>Page hits: 4<br>Uptime: 138 secs<br></div></body></html>

This is my attempt :

var request = require('request');
var cheerio = require('cheerio');





request('http://[aaaa::212:4b00:c2a:b704]/index.html', function(error, response,html){
        if(!error && response.statusCode == 200){
          //JSON.parse(html)
          //console.log('--------------------------------------');
          var temp = {"id":html}
          var obj = JSON.parse(temp)
          console.log(JSON.stringify(obj));
        }

});

how can I achieve putting the sensors div into an object that contain the sensors names as keys and the data of the sensors as proprieties

UPDATE :

thanks to Rafal Wiliński help I somehow managed to get it working but the last key is taking divs as a value in the object

new code :

var request = require('request');
var cheerio = require('cheerio');



 request('http://[aaaa::212:4b00:c2a:b704]/index.html', function(error, response,html){
        if(!error && response.statusCode == 200){

          var obj = {};
          html.split('\n').forEach((line) => {
             var key = line.split(' = ')[0];
             var value = line.split(' = ')[1];
             obj[key] = value;
          });
          console.log(JSON.stringify(obj,null,' '))


        }});

but my output is

{
 "Battery Temp": "22 C",
 "Battery Volt": "3320 mV",
 "Air Pressure": "1031.36 hPa",
 "Air Temp": "26.09 C",
 "Object Temp": "15.531 C",
 "Ambient Temp": "26.312 C",
 "Light": "0.08 lux",
 "HDC Humidity": "34.73 %RH",
 "HDC Temp": "26.38 C",
 "Acc X": "0.02 G",
 "Acc Y": "0.00 G",
 "Acc Z": "-1.05 G",
 "Gyro X": "-2.11 deg per sec",
 "Gyro Y": "-1.10 deg per sec",
 "Gyro Z": "3.64 deg per sec</pre></div><div>Page hits: 18<br>Uptime: 2968 secs<br></div></body></html>"
}
LearningDev
  • 45
  • 3
  • 9
  • Where would `html` come from? You do not create that variable. You would need to split `html` (once referenced) by `\n`, then by `=` and put that then into a JSON object. – Alexander Leithner Apr 04 '17 at 16:33
  • sorry I've never worked with html before, only brefly so I still don't understand, can you provide an example on how I would make an object called sensors and the values and keys are the sensor data for each ? – LearningDev Apr 04 '17 at 16:38

2 Answers2

2

You need to divide your string by = signs. Part before is a key, part after is your value.

Following function might resolve this issue:

function jsonify(str) {
   var obj = {};
   str.split('\n').forEach((line) => {
      var key = line.split(' = ')[0];
      var value = line.split(' = ')[1];
      obj[key] = value;
   });
   return obj;
}
Rafal Wiliński
  • 2,240
  • 1
  • 21
  • 26
0

I'd recommend that you use an HTML parser (I personally think jQuery is easy to use, but there are a LOT of options) to find and get the content from a specific element. Then you can run your parse logic on the result.

var response = '<html><head><title>Index</title><meta charset="UTF-8"></head><body><div><p>[ <a href="index.html">Index</a> ] | [ <a href="config.html">Device Config</a> ]</p></div><div>Neighbors<pre>fe80::212:4b00:8b8:6ecb REACHABLE</pre></div><div>Default Route<pre>fe80::212:4b00:8b8:6ecb</pre></div><div>Routes<pre></pre></div><div>Sensors<pre>Battery Temp = 19 C\nBattery Volt = 3320 mV\nAir Pressure = 1031.12 hPa\nAir Temp = 22.66 C\nObject Temp = 12.375 C\nAmbient Temp = 23.062 C\nLight = 0.00 lux\nHDC Humidity = 43.93 %RH\nHDC Temp = 23.03 C\nAcc X = 0.02 G\nAcc Y = 0.02 G\nAcc Z = -1.10 G\nGyro X = -2.93 deg per sec\nGyro Y = -2.74 deg per sec\nGyro Z = 5.18 deg per sec</pre></div><div>Page hits: 4<br>Uptime: 138 secs<br></div></body></html>';

// Turn the result into an HTML DOM.
var responseDOM = $(response);

// Find the specific element you want (in this case, the third pre) and get its content.
var preContent = $('pre', responseDOM).eq(3).text();

// Now, split the content into lines, split again by " = ", and then merge the result back into a single object.
var obj = preContent
      // Split content into lines (by "\n")
      .split('\n')
      // split each line into key and value (by " = ")
      .map(line => line.split(' = '))
      // reduce each key value pair into a single object with properties
      .reduce( (acc,kvp) => { acc[kvp[0]] = kvp[1]; return acc; }, {})


// Finally, turn the object into a JSON string.
var json = JSON.stringify(obj);

console.log(json);
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js"></script>
Community
  • 1
  • 1
JDB
  • 25,172
  • 5
  • 72
  • 123
  • thank you so much JDB, but unfortunately i never worked with jquery before, so i don't even know how to run this example, I only know basic JavaScript es6 with nodejs as I mostly program for embedded devices and systems, but a very clean approach I must say :D – LearningDev Apr 04 '17 at 17:44
  • If you are going to do web scraping, then you definitely need some kind of proper HTML parser (follow the link in my post). Trying to roll your own is like trying to roll your own memory management or your own encryption. Aside from the headaches... why would you when so many high quality options already exist! :) – JDB Apr 04 '17 at 17:50
  • exactly , no one should re invent the wheel , but the concept is so new to me that I was lost after hours of research lol yeah I'll definitely have a look at the link you posted to explore other options as well, cheerz, p.s. awesome avatar :D – LearningDev Apr 04 '17 at 17:53