1

// I am learning C# (working in .NET) for the first time, and I am trying to fetch the HTML for a list of Craigslist URLs, one for each city in the US. After concatenation, each URL looks like this: https://annarbor.craigslist.org/search/sss?query=searchTermFromInput&purveyor-input=all&srchType=T The base URL for each city is read from a local text file. I can run the same request in Postman and get data back, but with HttpClient I always get a 403. I have been stuck on this since 10am and would really appreciate any advice. Thanks

using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using System;
using System.IO;
using System.Net.Http;
using System.Reflection;
using System.Text.RegularExpressions;

namespace ScraperOne
{
    public partial class Interface : IInterface
    {
        private readonly ILogger<Interface> _log;
        private readonly IConfiguration _config;
        private readonly string filePathToURL = Environment.CurrentDirectory;
        private readonly string startPath = @"/search/sss?query=";
        private readonly string endPath = @"&purveyor-input=all&srchType=T";

        // Reuse one HttpClient for the whole process. Creating a new client per
        // request (as the original loop did) leaks sockets: disposed clients keep
        // connections in TIME_WAIT and can exhaust the port pool under load.
        private static readonly HttpClient _httpClient = CreateHttpClient();

        // Builds the shared client with the headers the scraper sends on every hit.
        private static HttpClient CreateHttpClient()
        {
            var client = new HttpClient();
            client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv: 85.0) Gecko / 20100101 Firefox / 85.0");
            client.DefaultRequestHeaders.Add("Connection", "keep-alive");
            return client;
        }

        public Interface(ILogger<Interface> log, IConfiguration config)
        {
            _log = log;
            _config = config;
        }

        /// <summary>
        /// Prompts the user for comma-separated search terms, builds one Craigslist
        /// search URL per site listed in <c>siteListCl.txt</c> (read from the current
        /// directory), requests each URL, and writes the response body — or the
        /// caught exception — to the console.
        /// </summary>
        /// <remarks>
        /// <see cref="HttpClient.GetStringAsync(string)"/> throws on non-success
        /// status codes, so a 403 (e.g. Craigslist blocking the caller's IP) lands
        /// in the catch block rather than producing a body.
        /// </remarks>
        public async System.Threading.Tasks.Task Run()
        {
            Console.ForegroundColor = ConsoleColor.Green;
            Console.WriteLine("{0}: Version{1} by {2}", _config.GetValue<string>("appName"), _config.GetValue<int>("version"), _config.GetValue<string>("Author"));
            Console.ResetColor();

            Console.WriteLine("Enter your key terms separated by commas:");
            string keyTerms = Console.ReadLine();

            // Split on commas (tolerating surrounding whitespace), then join with '+'
            // so the terms form a single query-string value.
            var searchTermsList = Regex.Split(keyTerms, @"\s*,\s*");
            var searchString = string.Join('+', searchTermsList);

            // Path.Combine is portable; the original hard-coded a '\' separator.
            string[] list = File.ReadAllLines(Path.Combine(filePathToURL, "siteListCl.txt"));

            foreach (string query in list)
            {
                var urlHit = query + startPath + searchString + endPath;
                try
                {
                    Console.WriteLine(urlHit);
                    string responseBody = await _httpClient.GetStringAsync(urlHit);
                    Console.WriteLine(responseBody);
                }
                catch (Exception e)
                {
                    // Intentionally broad: one failing site should not stop the
                    // remaining sites in the list from being attempted.
                    Console.WriteLine("exception caught!!");
                    Console.WriteLine(e);
                }
            }

            // Keep the console window open until the user presses Enter.
            string wait = Console.ReadLine();
        }

    }

}

CDhipple
  • 21
  • 4
  • to see what the message for the `403` is, try `var response = await httpClient.GetAsync(urlHit);` `var body = await response.Content.ReadAsStringAsync();` inspect the body and take it from there. – ManiVI Feb 19 '21 at 02:44
  • 1
    I just ran it and this was the response, thank you. https://annarbor.craigslist.org/search/sss?query=wr&purveyor-input=all&srchType=T StatusCode: 403, ReasonPhrase: 'Forbidden', Version: 1.1, Content: System.Net.Http.HttpConnectionResponseContent, Headers: { Set-Cookie: cl_b=4|5a98db4c16d6b9bafb096a566c6d11bb71bfdd2e|1613703608YJ4Vs;path=/;domain=.craigslist.org;expires=Fri, 01-Jan-2038 00:00:00 GMT Strict-Transport-Security: max-age=63072000 Content-Length: 117 } This IP has been automatically blocked. If you have questions, please email: blocks-b1613521967493419@craigslist.org – CDhipple Feb 19 '21 at 03:01
  • I have switched out locations on my VPN every time I run this, and I still get the same error – CDhipple Feb 19 '21 at 03:03
  • Have you had any luck with this? We started experiencing unexplained 403s from our .NET traffic on some sites around the time this was posted. Those sites are accepting python requests/scrapy traffic without issue (200s). From all investigated angles, the requests from python and .NET appear to be the same. – pwinz Feb 25 '21 at 18:23
  • Other questions about the same topic had success with WebClient (instead of HttpClient), and wb.Headers.Add("User-Agent: ..."); See https://stackoverflow.com/a/6905471/303690 and https://stackoverflow.com/a/2794346/303690 – CLS Sep 22 '21 at 15:48
  • As the comment above by CDhipple states, the reason is that the site (Craigslist) was blocking his IP address. – Judah Gabriel Himango May 05 '22 at 19:25

0 Answers0