0

I'm scraping a site, and the data I want is embedded in a script tag of an HTML page. I wrote a regular expression (using Python's `re` module) to extract it, but it seems I am doing it the wrong way.

    // Assigns a fresh empty object to Hub (no declaration keyword is visible in this snippet).
    Hub = {};
    // Minimal key/value store; entries are kept in the nested `config` object.
    Hub.config = {
        // Backing storage, keyed by the names passed to set().
        config: {},
        // Returns the value stored under `key`, or null when the key is not found.
        // Note: `in` also sees inherited properties, so a key such as 'toString'
        // yields the inherited function instead of null.
        get: function(key) {
            if (key in this.config) {
                return this.config[key];
            } else {
                return null;
            }
        },
        // Stores `val` under `key`, overwriting any value already stored there.
        set: function(key, val) {
            this.config[key] = val;
        }
    };

    // Stores the object literal below under the key 'sku' via Hub.config.set.
    // This call is the text the question wants to extract from the page.
    Hub.config.set('sku', {
        // Item id plus the cart page URL for this item.
        valCartInfo      : {
            itemId : '576938415361',
            cartUrl: '//cart.mangolane.com/cart.htm'
        },
        // Protocol-relative endpoint URLs; several carry the same item id as valCartInfo.itemId.
        apiRelateMarket  : '//tui.mangolane.com/recommend?appid=16&count=4&itemid=576938415361',
        apiAddCart       : '//cart.mangolane.com/add_cart_item.htm?item_id=576938415361',
        apiInsurance     : '',
        wholeSibUrl      : '//detailskip.mangolane.com/service/getData/1/p1/item/detail/sib.htm?itemId=576938415361&sellerId=499095250&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,upp,activity,fqg,zjys,amountRestriction,couponActivity,soldQuantity,page,originalPrice,tradeContract',
        areaLimit        : '',
        bigGroupUrl      : '',
        valPostFee       : '',
        coupon           : {
            couponApi         : '//detailskip.mangolane.com/json/activity.htm?itemId=576938415361&sellerId=499095250',
            couponWidgetDomain: '//assets.mgcdn.com',
            cbUrl             : '/cross.htm?type=weibo'
        },
        valItemInfo      : {

            defSelected: -1,
            // skuMap: each key is a ';'-delimited string of two id pairs; each value
            // holds that entry's price, stock, skuId and oversold flag.
            skuMap     : {";20549:103189693;1627207:811754571;":{"price":"528.00","stock":"2","skuId":"4301611864655","oversold":false},
                          ";20549:59280855;1627207:412796441;":{"price":"528.00","stock":"2","skuId":"4432149803707","oversold":false},
                          ";20549:59280855;1627207:196576508;":{"price":"528.00","stock":"2","skuId":"4018119863100","oversold":false},
                          ";20549:72380707;1627207:28341;":{"price":"528.00","stock":"2","skuId":"4166690818570","oversold":false},
                          ";20549:418624880;1627207:28341;":{"price":"528.00","stock":"2","skuId":"4166690818566","oversold":false},
                          ";20549:418624880;1627207:196576508;":{"price":"528.00","stock":"2","skuId":"4018119863098","oversold":false},
                          ";20549:72380707;1627207:3224419;":{"price":"528.00","stock":"2","skuId":"4166690818571","oversold":false},
                          ";20549:147478970;1627207:196576508;":{"price":"528.00","stock":"2","skuId":"4018119863094","oversold":false},
                          ";20549:72380707;1627207:384366805;":{"price":"528.00","stock":"2","skuId":"4432149803708","oversold":false},
                          ";20549:296172561;1627207:811754571;":{"price":"528.00","stock":"2","skuId":"4301611864659","oversold":false},
                          ";20549:72380707;1627207:1150336209;":{"price":"528.00","stock":"2","skuId":"4301611864664","oversold":false},
                          ";20549:147478970;1627207:93586002;":{"price":"528.00","stock":"2","skuId":"4018119863095","oversold":false}}
            // propertyMemoMap: text label for each "1627207:<id>" pair that appears in the skuMap keys.
            ,propertyMemoMap: {"1627207:811754571":"黑色单里(预售) 年后2.29发货","1627207:93586002":"黑色加绒 现货","1627207:412796441":"黑色(兔毛) 现货","1627207:384366805":"米白色(兔毛) 现货","1627207:3224419":"驼色 现货","1627207:1150336209":"驼色单里(预售) 年后2.29发货","1627207:28341":"黑色 现货","1627207:196576508":"驼色加绒 现货"}


        }
    });

I need to get only the data passed to Hub.config.set('sku', ...).

I did this, but it didn't work:

config_base_str = re.findall("Hub.config.set ({[\s\S]*?});", config) where config is the string of data

  • Does this answer your question? [RegEx match open tags except XHTML self-contained tags](https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags) – Hamms Feb 14 '20 at 01:09
  • No please, thanks for this –  Feb 14 '20 at 01:19
  • What is the current result? – AMC Feb 14 '20 at 01:43
  • I think there's some confusion here on what the regular expression is going to match. The curly braces aren't part of the string - they're part of the object that holds many strings. So you can't match those. And on a side topic, the word "code" is a collective - "people write code" -not- "people write codes" =) – Todd Feb 14 '20 at 02:40
  • @AMC i got an empty list –  Feb 14 '20 at 09:42

1 Answer

0

The period and parenthesis have a special meaning in regex. If you want to search for the literal characters, you will need to escape them first with a backslash.

For example assuming the string:

config = """
    Hub.config.set('sku', {
    valCartInfo      : {
        itemId : '576938415361',
        cartUrl: '//cart.mangolane.com/cart.htm'
    },
.........
};
"""

If you only want the key, you can do something like this:

config_base_str = re.findall(r"Hub\.config\.set\('(\w*)", config)  # ['sku']; raw string keeps the regex backslashes intact

If you want everything after the key within the brackets, you can do something like this instead:

config_base_str = re.findall(r"Hub\.config\.set\('\w*',\s*({[\s\S]*})", config)  #  ["{\n valCartInfo : {} ...}"]; raw string keeps the regex backslashes intact

https://regex101.com/r/QHdaG2/3/

Andrew L
  • 1,164
  • 12
  • 20
  • Thanks for this awesome snippet, it worked, but I had a problem: there's another JS object that matches this pattern, so it was returned as well. It starts with ```Hub.config.set('desc',``` instead, and I'm only interested in the sku one, so I used string slicing to find the first ') –  Feb 14 '20 at 09:53
  • In that case, you can just replace the \w* with sku. "Hub\.config\.set\('sku',\s*({[\s\S]*})" – Andrew L Feb 14 '20 at 12:20