1

Full-import failing when using CachedSqlEntityProcessor giving Exception

java.lang.OutOfMemoryError: GC overhead limit exceeded

How can i resolve this Issue.......

Without using CachedSqlEntityProcessor it is taking 15 hrs to index

and My products-data-config.xml is

<dataConfig>
<dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver" url="jdbc:mysql://localhost:3306/localbazaar" user="root" password="sa" batchSize="100" />
    <document name="products">
        <entity name="domainProduct" query="SELECT p.PRODUCT_ID, p.NAME, LOWER(REPLACE(REPLACE(p.NAME,' ','-'),'/','-')) AS purl, p.description, p.BRAND_ID, p.CATEGORY_ID, p.GROUP_ID, p.MIN_PRICE, p.MAX_PRICE, p.AUTHOR, p.ISBN10, p.ISBN13, p.OLID, p.EAN13, p.UPCA, p.SKU, p.LANGUAGE, p.FORMAT, p.PUBLISHER, p.SUBJECT, c.NAME AS cname, c.URL_NAME, b.NAME AS bname, LOWER(REPLACE(REPLACE(b.NAME,' ','-'),'/','-')) AS bUrl, CONCAT('http://partnercenter.localbazaar.com/image?imageId=',i.IMAGE_NAME) AS productImage FROM product_t p LEFT OUTER JOIN category_t c ON (c.CATEGORY_ID=p.CATEGORY_ID) LEFT OUTER JOIN brand_t b ON (b.BRAND_ID=p.BRAND_ID) LEFT OUTER JOIN image_t i ON (i.ASSET_ID=p.PRODUCT_ID AND i.ASSET_TYPE_ID = 4 AND i.IMAGE_TYPE_ID = 0)">
            <field column="PRODUCT_ID" name="productId" />
            <field column="NAME" name="productName" />
            <field column="purl" name="productUrlName" />
            <field column="description" name="productDescription" />
            <field column="BRAND_ID" name="brandId" />
            <field column="CATEGORY_ID" name="categoryId" />
            <field column="GROUP_ID" name="groupId" />
            <field column="MIN_PRICE" name="minPrice" />
            <field column="MAX_PRICE" name="maxPrice" />
            <field column="AUTHOR" name="author" />
            <field column="ISBN10" name="isbn10" />
            <field column="ISBN13" name="isbn13" />
            <field column="OLID" name="olid" />
            <field column="EAN13" name="ean13" />
            <field column="UPCA" name="upca" />
            <field column="SKU" name="sku" />
            <field column="LANGUAGE" name="language" />
            <field column="FORMAT" name="format" />
            <field column="PUBLISHER" name="publisher" />
            <field column="SUBJECT" name="subject" />
            <field column="cname" name="categoryName" />
            <field column="URL_NAME" name="categoryUrlName" />
            <field column="bname" name="brandName" />
            <field column="bUrl" name="brandUrlName" />
            <field column="productImage" name="productImage" />
            <entity name="specifications" query="select PRODUCT_ID, CONCAT(PROPERTY_NAME,':::',property_value) as specifications FROM product_properties_t " processor="CachedSqlEntityProcessor" where="PRODUCT_ID=domainProduct.PRODUCT_ID" />
        </entity>
    </document>
</dataConfig>

and My store-products-data-config.xml is

<dataConfig>
    <dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver" url="jdbc:mysql://localhost:3306/localbazaar" user="root" password="sa" batchSize="100" />
    <document name="products">
        <entity name="domainStoreProduct" query="SELECT sp.STORE_PRODUCT_ID, sp.STORE_ID, sp.PRODUCT_ID, sp.MIN_PRICE, sp.MAX_PRICE, sp.STORE_TYPE_ID, sp.BUY_X, sp.GET_Y, s.NAME AS sname, LOWER(REPLACE(REPLACE(s.NAME,' ','-'),'/','-')) AS sUrl, s.DESCRIPTION AS sdesc, s.WEB_SITE_UTL, s.EMAIL, s.PHONE, s.MOBILE, s.ACTIVE AS act, a.ADDRESS_ID, a.location, LOWER(REPLACE(REPLACE(a.location,' ','-'),'/','-')) AS urlLoc, a.ADDRESS_LINE1, a.ADDRESS_LINE2, a.LATITUDE, a.LONGITUDE, a.zipcode, a.LANDMARK, a.CITY, CONCAT(a.LATITUDE,',',a.LONGITUDE) AS ll, p.NAME AS pname, LOWER(REPLACE(REPLACE(p.NAME,' ','-'),'/','-')) AS purl, p.description AS pdesc, p.BRAND_ID, p.CATEGORY_ID, p.GROUP_ID, p.AUTHOR, p.ISBN10, p.ISBN13, p.OLID, p.EAN13, p.UPCA, p.SKU, p.LANGUAGE, p.FORMAT, p.PUBLISHER, p.SUBJECT, c.NAME AS cname, c.URL_NAME, b.NAME AS bname, LOWER(REPLACE(REPLACE(b.NAME,' ','-'),'/','-')) AS bUrl, CONCAT('http://partnercenter.localbazaar.com/image?imageId=',ip.IMAGE_NAME) AS pImage, CONCAT('http://partnercenter.localbazaar.com/image?imageId=',ist.IMAGE_NAME) AS sImage, ci.CITY_ID FROM store_products_t sp LEFT OUTER JOIN store_t s ON (sp.STORE_ID=s.STORE_ID) LEFT OUTER JOIN address_t a ON (a.ASSET_TYPE_ID=3 AND a.ASSET_ID=sp.STORE_ID) LEFT OUTER JOIN product_t p ON (p.PRODUCT_ID=sp.PRODUCT_ID) LEFT OUTER JOIN category_t c ON (c.CATEGORY_ID=p.CATEGORY_ID) LEFT OUTER JOIN brand_t b ON (b.BRAND_ID=p.BRAND_ID) LEFT OUTER JOIN image_t ip ON (ip.ASSET_ID=sp.PRODUCT_ID AND ip.ASSET_TYPE_ID=4 AND ip.IMAGE_TYPE_ID=0) LEFT OUTER JOIN image_t ist ON (ist.ASSET_ID=sp.STORE_ID AND ist.ASSET_TYPE_ID=3 AND ist.IMAGE_TYPE_ID=0) LEFT OUTER JOIN city_t ci ON (ci.NAME=a.CITY)">
            <field column="STORE_PRODUCT_ID" name="storeProductId" />
            <field column="STORE_ID" name="storeId" />
            <field column="PRODUCT_ID" name="productId" />
            <field column="MIN_PRICE" name="storeMinPrice" />
            <field column="MAX_PRICE" name="storeMaxPrice" />
            <field column="STORE_TYPE_ID" name="storeTypeId" />
            <field column="BUY_X" name="buyX" />
            <field column="GET_Y" name="getY" />
            <field column="sname" name="storeName" />
            <field column="sUrl" name="storeUrlName" />
            <field column="sdesc" name="description" />
            <field column="WEB_SITE_UTL" name="webSiteUrl" />
            <field column="EMAIL" name="email" />
            <field column="PHONE" name="phone" />
            <field column="MOBILE" name="mobile" />
            <field column="act" name="active" />
            <field column="ADDRESS_ID" name="addressId" />
            <field column="location" name="location" />
            <field column="urlLoc" name="urlLocation" />
            <field column="ADDRESS_LINE1" name="addressLine1" />
            <field column="ADDRESS_LINE2" name="addressLine2" />
            <field column="LATITUDE" name="latitude" />
            <field column="LONGITUDE" name="longitude" />
            <field column="zipcode" name="zipcode" />
            <field column="LANDMARK" name="landmark" />
            <field column="CITY" name="city" />
            <field column="ll" name="latlong" />
            <field column="pname" name="productName" />
            <field column="purl" name="productUrlName" />
            <field column="pdesc" name="productDescription" />
            <field column="BRAND_ID" name="brandId" />
            <field column="CATEGORY_ID" name="categoryId" />
            <field column="GROUP_ID" name="groupId" />
            <field column="AUTHOR" name="author" />
            <field column="ISBN10" name="isbn10" />
            <field column="ISBN13" name="isbn13" />
            <field column="OLID" name="olid" />
            <field column="EAN13" name="ean13" />
            <field column="UPCA" name="upca" />
            <field column="SKU" name="sku" />
            <field column="LANGUAGE" name="language" />
            <field column="FORMAT" name="format" />
            <field column="PUBLISHER" name="publisher" />
            <field column="SUBJECT" name="subject" />
            <field column="cname" name="categoryName" />
            <field column="URL_NAME" name="categoryUrlName" />
            <field column="bname" name="brandName" />
            <field column="bUrl" name="brandUrlName" />
            <field column="pImage" name="productImage" />
            <field column="sImage" name="storeImage" />
            <field column="CITY_ID" name="cityId" />
            <entity name="specifications" query="select PRODUCT_ID, CONCAT(PROPERTY_NAME,':::',property_value) as specifications FROM product_properties_t " processor="CachedSqlEntityProcessor" WHERE="PRODUCT_ID= domainStoreProduct.PRODUCT_ID" />
        <entity name="storeProperties" query="select STORE_ID, CONCAT(PROPERTY_ID,':::',PROPERTY_VALUE) as storeProperties FROM store_properties_t " processor="CachedSqlEntityProcessor" WHERE="STORE_ID=domainStoreProduct.STORE_ID" />
        </entity>
    </document>
</dataConfig>
Satish V
  • 53
  • 7

1 Answers1

2

You can try different things:

  1. Try setting the batchSize property. If you tune it correctly, you can increase the performance of your datasource.
  2. SELECT * is ALWAYS slower than providing the columns you need (even if you need all columns). I would suggest using SELECT PRODUCT_ID, NAME, ... in stead of using *
  3. Why do you have the entities b, i and s? You don't use the fields from it, so I don't think they're very useful
  4. Try using the CachedSqlEntityProcessor for your sub-entities. It will only retrieve the data once and re-use it for each subenttiy.
  5. Can your product belong to more than 1 category (is it a multivalued field?), if not, then writing one query using JOINS is faster than writing multiple entities.

EDIT: I suggest seperating this thing into 2 questions because now it's really weird for other people to read your new question with my old answer.

I don't think you can choose where the CachedSqlEntityProcessor will put his cache (it's always in memory I think). The problem with your 8 hours of data import is that, because we're talking about a lot of records, a lot of queries will be used (every subentity uses its own query).

The solution to your problem is to remove the subentity and in your parent entity add the query of your subentity as a comma seperated list. I suggest looking at this answer.

If you do this, all your specifications (for examples) will be stored inside one column as a comma speerated list. You can then use a Solr ScriptTransformer to split the values and create multiple values.

This limits the number of queries to 1 big query and will also limit the use of RAM since it will parse each query individually. I have no clue what the performance will be, because you will have to parse each entity individually.

If this doesn't work I don't think there is a better solution than to wait 8 hours for the data import to complete. You can't expect that Solr will index it all in 1 2 3. You can try using a cronjob to run this task over night.

Community
  • 1
  • 1
g00glen00b
  • 41,995
  • 13
  • 95
  • 133
  • Thanks Dimitri Now I can Export all the Data in 1 Min and CachedSqlEntityProcessor is Much Help Full and I also removed the entities b,i and s by using joins – Satish V Apr 27 '13 at 04:56
  • Dimitri I facing problems with the CachedSqlEntityProcessor can you help me on this and I am using solr-4.2 – Satish V Apr 27 '13 at 07:09
  • You're not using the CachedSqlEntityProcessor properly (now it's acting like a normal SQL processor). You should leave the `WHERE` clausule and put it in the `where` field of your entity (or in the `cacheKey`/`cacheLookup` variant). Look at example 2 and 3 of the Solr documentation (look at the link I gave you earlier) – g00glen00b Apr 27 '13 at 13:56
  • Dimitri I tried with the 3 rd example now the Product Properties not Indexed at all – Satish V Apr 29 '13 at 04:43
  • I tried with where clause shown in the second example with that also specifications are not indexing – Satish V Apr 29 '13 at 05:01
  • can i know what is wrong with the entity, are there any mistakes in it. Thanks For spending Time to resolve this Isssue. – Satish V Apr 29 '13 at 05:05
  • No clue, I never had to work with the CachedSqlEntityProcessor beforen but can you try to use `where="PRODUCT_ID=${domainProduct.PRODUCT_ID}`? – g00glen00b Apr 30 '13 at 10:04
  • I also tried that but it is giving null value for ${domainProduct.PRODUCT_ID} in where="PRODUCT_ID=${domainProduct.PRODUCT_ID} – Satish V May 01 '13 at 05:12
  • the products are indexing in 1 minute in products-data-config.xml with the line actually CachedSqlEntityProcessor is importing all the data from product_properties_t into memory and heap memory availabe in my system is 1GB. – Satish V May 01 '13 at 05:23
  • But in case of store-products-data-config.xml there is one more table store_properties_t witch has many number of records by using CachedSqlEntityProcessor on that table the heap size is greater than the size available in my system so it is throwing an exception java.lang.OutOfMemoryError: GC overhead limit exceeded. If i removed CachedSqlEntityProcessor the DIH is taking 8 hrs to index. I have to look for another solution for this. I cant increase the heap size b/c my ram is limited. we can only use CachedSqlEntityProcessor if we dont have any limits on Ram memory. – Satish V May 01 '13 at 05:25