0

I am trying to implement the built in kmeans() function using POSTGIS as follows:

create or replace FUNCTION kmeans (number_of_clusters INTEGER) RETURNS TEXT AS
$$
begin
    
with data as(
select at2."name" as area_type, a.name,bf.* from building_footprints bf
left join area a on bf."areaID" = a."ID"
left join area_type at2 on at2."ID" = a."areaTypeID"
where a."ID" in (select "areaID" from campaign_scope)
and at2."name"='Uc'
)
,
dbscan as(
select data.*,ST_ClusterDBSCAN(st_transform(data."geoJson",24313),eps := 50, minPoints := 1) 
  OVER(ORDER BY data."ID") AS cluster_id from data)
,

filter as(
select cluster_id,count("ID") from dbscan
group by cluster_id
having count("ID") < 10
) 

,

accepted_dbscan as
(
select * from dbscan 
where cluster_id in (select cluster_id from filter)

)
,

rejected_dbscan as 
(
select * from dbscan 
where cluster_id not in (select cluster_id from filter)
)

    
RETURN 'SELECT "ID", ST_ClusterKMeans(rejected_dbscan."geoJson", ' || number_of_clusters || ') OVER() AS cluster_id FROM rejected_dbscan';
END;
$$ LANGUAGE PLPGSQL;

This gives an error:

SQL Error [42601]: ERROR: syntax error at or near "RETURN"

Does the entire query need to be in the RETURN clause perhaps? I am also concerned whether to RETURN TEXT or a table with different specified columns.

UPDATE: I have also tried the following:

create or replace FUNCTION kmeans (number_of_clusters INTEGER) 

RETURNS TABLE (ID varchar ,
cluster_id int)

LANGUAGE plpgsql AS  
$func$
DECLARE
number_of_clusters int;
BEGIN

RETURN QUERY    
    
with data as(
select at2."name" as area_type, a.name,bf.* from building_footprints bf
left join area a on bf."areaID" = a."ID"
left join area_type at2 on at2."ID" = a."areaTypeID"
where a."ID" in (select "areaID" from campaign_scope)
and at2."name"='Uc'
)
,
dbscan as(
select data.*,ST_ClusterDBSCAN(st_transform(data."geoJson",24313),eps := 50, minPoints := 1) 
  OVER(ORDER BY data."ID") AS cluster_id from data)
,

filter as(
select cluster_id,count("ID") from dbscan
group by cluster_id
having count("ID") < 10
) 

,

accepted_dbscan as
(
select * from dbscan 
where cluster_id in (select cluster_id from filter)

)
,

rejected_dbscan as 
(
select * from dbscan 
where cluster_id not in (select cluster_id from filter)
)

SELECT "ID", ST_ClusterKMeans(rejected_dbscan."geoJson", ' || number_of_clusters || ') OVER() AS cluster_id FROM rejected_dbscan
    
END
$func$;

but again, this gives the following error:

SQL Error [42601]: ERROR: syntax error at or near "END"
  Position: 1114

Second update: based on the comment by Luuk, I have updated the code to:

create or replace FUNCTION kmeans (number_of_clusters INTEGER) 

RETURNS TABLE (ID varchar ,
cluster_id int)

LANGUAGE plpgsql AS  
$$
DECLARE
number_of_clusters int;
BEGIN

RETURN QUERY    
    
with data as(
select at2."name" as area_type, a.name,bf.* from building_footprints bf
left join area a on bf."areaID" = a."ID"
left join area_type at2 on at2."ID" = a."areaTypeID"
where a."ID" in (select "areaID" from campaign_scope)
and at2."name"='Uc'
)
,
dbscan as(
select data.*,ST_ClusterDBSCAN(st_transform(data."geoJson",24313),eps := 50, minPoints := 1) 
  OVER(ORDER BY data."ID") AS cluster_id from data)
,

filter as(
select cluster_id,count("ID") from dbscan
group by cluster_id
having count("ID") < 10
) 

,

accepted_dbscan as
(
select * from dbscan 
where cluster_id in (select cluster_id from filter)

)
,

rejected_dbscan as 
(
select * from dbscan 
where cluster_id not in (select cluster_id from filter)
)

SELECT "ID", ST_ClusterKMeans(rejected_dbscan."geoJson", ' || number_of_clusters || ') OVER() AS cluster_id FROM rejected_dbscan
    
end;
$$;

But I am being presented with this error -

SQL Error [42601]: ERROR: syntax error at or near "end"

Erwin Brandstetter
  • 605,456
  • 145
  • 1,078
  • 1,228
analyst92
  • 243
  • 1
  • 6
  • 2
    Change `END` to `END;` in you second try. Please do not try to find out how to create a function by _trying_ stuff. Just read [this](https://www.postgresqltutorial.com/postgresql-plpgsql/postgresql-create-function/), it help you do it correct in one go... – Luuk Aug 01 '23 at 11:24
  • @Luuk I have changed the code, as is shown in the"Second update". But the error is still there. – analyst92 Aug 01 '23 at 11:52
  • 3
    Please check for more missing `;`. – Luuk Aug 01 '23 at 12:05
  • 2
    @Luuk, from here [plpgsql structure](https://www.postgresql.org/docs/current/plpgsql-structure.html): *...however the final END that concludes a function body does not require a semicolon.* – Adrian Klaver Aug 01 '23 at 14:54
  • 2
    Missing `;` after `... FROM rejected_dbscan`. From [plpgsql structure](https://www.postgresql.org/docs/current/plpgsql-structure.html): *Each declaration and each statement within a block is terminated by a semicolon. A block that appears within another block must have a semicolon after END, as shown above; however the final END that concludes a function body does not require a semicolon.* – Adrian Klaver Aug 01 '23 at 14:56
  • 1
    @AdrianKlaver: The first example in my [link](https://www.postgresqltutorial.com/postgresql-plpgsql/postgresql-create-function/) does not have a semicolon, the rest of the examples have (But luckily when finally having a semicolon does not break the creation of the function.) – Luuk Aug 01 '23 at 15:01

1 Answers1

0

The immediate cause of the error message is the missing semicolon (;) at the end of the SELECT statement, as was pointed out in comments.

But much of your function is noise or nonsense. Boils down to just:

CREATE OR REPLACE FUNCTION kmeans(_number_of_clusters integer)
  RETURNS TABLE (id varchar, cluster_id int)
  LANGUAGE plpgsql AS
$func$
BEGIN
   RETURN QUERY
   WITH data AS (
      SELECT bf."ID", bf."geoJson"
           , st_clusterdbscan(st_transform(bf."geoJson", 24313), eps => 50, minpoints => 1) OVER (ORDER BY bf."ID") AS cluster_id
      FROM   building_footprints bf
      JOIN   area      a   ON a."ID" = bf."areaID"               -- LEFT JOIN was nonsense
      JOIN   area_type at2 ON at2."ID" = a."areaTypeID"
      WHERE  a."ID" IN (SELECT "areaID" FROM campaign_scope)
      AND    at2.name = 'Uc'
      )
   SELECT "ID"
        , st_clusterkmeans(d."geoJson", ' || _number_of_clusters || ') OVER ()  -- AS cluster_id  -- noise
   FROM  (
      SELECT d1.cluster_id
      FROM   data d1
      GROUP  BY d1.cluster_id
      HAVING count(*) >= 10
      ) filter
   JOIN   data d USING (cluster_id);
END
$func$;

Or simpler with a plain SQL function:

CREATE OR REPLACE FUNCTION kmeans(_number_of_clusters integer)
  RETURNS TABLE (id varchar, cluster_id int)
  LANGUAGE sql AS
$func$
WITH data AS (
   SELECT bf."ID", bf."geoJson"
        , st_clusterdbscan(st_transform(bf."geoJson", 24313), eps => 50, minpoints => 1) OVER (ORDER BY bf."ID") AS cluster_id
   FROM   building_footprints bf
   JOIN   area      a   ON a."ID" = bf."areaID"
   JOIN   area_type at2 ON at2."ID" = a."areaTypeID"
   WHERE  a."ID" IN (SELECT "areaID" FROM campaign_scope)
   AND    at2.name = 'Uc'
   )
SELECT d."ID", st_clusterkmeans(d."geoJson", ' || _number_of_clusters || ') OVER()  -- AS cluster_id   -- noise
FROM  (
   SELECT d1.cluster_id
   FROM   data d1
   GROUP  BY 1
   HAVING count(*) >= 10
   ) filter
JOIN   data d USING (cluster_id);
$func$;

Assuming building_footprints."ID" is defined NOT NULL.

Further reading:

Erwin Brandstetter
  • 605,456
  • 145
  • 1,078
  • 1,228