0

I need to retrieve newly fetched records based on their timestamp. I used "max", which returns only one record, and the same happens with "desc" combined with "limit".

I need to fetch the new records dynamically whenever data is loaded into the table.

       Task     Task-B    Timestamp
       aaa      bbb       2020-09-02 16:45:12
       aa2      bb2       2020-09-02 17:16:10
       aa3      bb3       2020-09-03 10:09:15
       aa4      bb4       2002-09-01 09:14:34 

Tasks aaa to aa3 are new; I need to retrieve only those:

       Task     Task-B    Timestamp
       aaa      bbb       2020-09-02 16:45:12
       aa2      bb2       2020-09-02 17:16:10
       aa3      bb3       2020-09-03 10:09:15
       
sanjana jha
  • 209
  • 3
  • 14

1 Answers1

0

Task: show the data from the last 24 hours.

import org.apache.spark.sql.functions._
import spark.implicits._
    
/** One row of the sample task table.
  *
  * @param task      value shown in the `task` column
  * @param taskR     value shown in the `taskR` column
  * @param timestamp timestamp kept as a plain string, e.g. "2020-09-02 16:45:12"
  */
case class Task(
  task: String,
  taskR: String,
  timestamp: String
)

// Sample input: four Task rows turned into a DataFrame via the implicit
// `toDF` conversion. Timestamps are kept as plain strings; three rows are
// from September 2020 and one is from 2002.
val sourceDF = Seq(
  Task(task = "aaa", taskR = "bbb", timestamp = "2020-09-02 16:45:12"),
  Task(task = "aa2", taskR = "bb2", timestamp = "2020-09-02 17:16:10"),
  Task(task = "aa3", taskR = "bb3", timestamp = "2020-09-03 10:09:15"),
  Task(task = "aa4", taskR = "bb4", timestamp = "2002-09-01 09:14:34")
).toDF()

// Print the rows without truncating long cell values.
sourceDF.show(truncate = false)
//  +----+-----+-------------------+
//  |task|taskR|timestamp          |
//  +----+-----+-------------------+
//  |aaa |bbb  |2020-09-02 16:45:12|
//  |aa2 |bb2  |2020-09-02 17:16:10|
//  |aa3 |bb3  |2020-09-03 10:09:15|
//  |aa4 |bb4  |2002-09-01 09:14:34|
//  +----+-----+-------------------+


// Keep only the rows whose `timestamp` is at most 24 hours older than the
// current time. Both instants are converted to epoch seconds with
// `unix_timestamp`, and the difference is divided by 3600 to get hours.
//
// `col("timestamp")` is used instead of the symbol literal 'timestamp:
// symbol literals are deprecated since Scala 2.13 and unavailable in Scala 3.
//
// NOTE(review): a row stamped in the future yields a negative age and so also
// passes the `<= 24` check -- confirm whether that is intended.
val resDF = {
  val ageInHours =
    (unix_timestamp(current_timestamp()) - unix_timestamp(col("timestamp"))) / 3600
  sourceDF.where(ageInHours <= 24)
}

resDF.show(false)
resDF.printSchema()
// Sample output. The rows shown depend on the wall-clock time of the run;
// this capture is presumably from a run on 2020-09-03, when the three
// September 2020 rows were within the 24-hour window.
//  +----+-----+-------------------+
//  |task|taskR|timestamp          |
//  +----+-----+-------------------+
//  |aaa |bbb  |2020-09-02 16:45:12|
//  |aa2 |bb2  |2020-09-02 17:16:10|
//  |aa3 |bb3  |2020-09-03 10:09:15|
//  +----+-----+-------------------+
//
//  root
//  |-- task: string (nullable = true)
//  |-- taskR: string (nullable = true)
//  |-- timestamp: string (nullable = true)
mvasyliv
  • 1,214
  • 6
  • 10