0

I'm trying to scrape a website using the `select` crate in Rust. Here is the site structure:

example.com/category-list/


example.com/cat/programming-questions/
qtitle = "How to become a programmer"
qid = 2

example.com/cat/networking-questions/
qtitle = "New question"
qid = 3
qtitle = "Other question"
qid = 4

Scraper code:


/// Scraped question data; serializes to the `q_title`, `q_id` and `q_link` keys.
#[derive(Debug, Serialize, Deserialize)]
pub struct Question {
    /// Question titles, kept as strings.
    q_title: Vec<String>,
    /// Question ids, kept as strings.
    q_id: Vec<String>,
    /// Question links, kept as strings.
    q_link: Vec<String>,
}

let mut questions_vector = Vec::new();

// One `Question` is built per response body, so every match found in a body
// ends up in that body's single entry.
// `reponse_list` is defined outside this snippet; its spelling is kept as-is.
for resp in reponse_list.iter() {
    // Parse the body once and reuse the document for all three queries.
    let document = Document::from(resp.as_str());

    // Elements carrying the `qTitle` class.
    let q_title: Vec<String> = document
        .select(Class("qTitle"))
        .map(|node| node.to_string())
        .collect();

    // Elements carrying the `qid` class.
    let q_id: Vec<String> = document
        .select(Class("qid"))
        .map(|node| node.to_string())
        .collect();

    // `href` values of `<a>` elements; anchors without an `href` are skipped.
    let q_link: Vec<String> = document
        .select(Name("a"))
        .filter_map(|node| node.attr("href"))
        .map(String::from)
        .collect();

    questions_vector.push(Question {
        q_title,
        q_id,
        q_link,
    });
}

println!(
    "{}",
    serde_json::to_string(&questions_vector)
        .expect("a Vec of string-only structs always serializes to JSON")
);

Output of my code:

[
  {
    "q_title": [
      "How to become a programmer"
    ],
    "q_id": [
      "2"
    ],
    "q_link": [
      "https://example.com/q/How-to-become-a-programmer"
    ]
  },
  {
    "q_title": [
      "New question",
      "Other question"
    ],
    "q_id": [
      "3",
      "4"
    ],
    "q_link": [
      "https://example.com/q/New-question",
      "https://example.com/q/Other-question"
    ]
  }
]

Desired output:

[
  {
    "q_title": [
      "How to become a programmer"
    ],
    "q_id": [
      "2"
    ],
    "q_link": [
      "https://example.com/q/How-to-become-a-programmer"
    ]
  },
  {
    "q_title": [
      "New question"
    ],
    "q_id": [
      "3"
    ],
    "q_link": [
      "https://example.com/q/New-question"
    ]
  },
  {
    "q_title": [
      "Other question"
    ],
    "q_id": [
      "4"
    ],
    "q_link": [
      "https://example.com/q/Other-question"
    ]
  }
]

I need to refactor my code to achieve the desired output.

Cham Lake
  • 1
  • 1
  • What is your question? – Jonas Apr 29 '21 at 17:42
  • @Jonas I wrote it at the bottom of question: I need to refactor my code to achieve purposed output. – Cham Lake Apr 29 '21 at 18:14
  • You might want to [zip all 3 of your iterators together](https://stackoverflow.com/questions/29669287/how-can-i-zip-more-than-two-iterators), and then iterate over those tuples to create your `Question` object. – Michael Keane Galloway Apr 29 '21 at 18:22
  • @ChamLake this is not really a rust specific question, I believe. What you do now, is you find a collection of `href` elements, you add all of them to a single `Question` struct and push it to a vector. What you want to do is to create a new `Question` instance for each href you find, and push each of them to a vector. – lpiepiora Apr 30 '21 at 07:59
  • 1
    Thanks @MichaelKeaneGalloway, This is the solution. It works for me. – Cham Lake Apr 30 '21 at 12:50

0 Answers0