1

I wrote a Python 3 script that generates a formatted RSS (XML) feed. However, when I open the output XML file in Chrome, the newlines are missing. Here is my code:

import requests
import csv
import re
import math
from babel.numbers import format_decimal
from lxml import html, etree
from rfeed import *

class Scraper:
    """Fetch an FPDS Atom search feed and republish unseen awards as RSS.

    Workflow (see ``parse_xml``): download ``self.url``, split the response
    into ``<entry>`` sections, build one rfeed ``Item`` for every entry whose
    key (PIID + signed date) is not yet listed in ``existing.csv``, write the
    items to ``fpds_rss_feed.xml`` and append the new keys to ``existing.csv``.
    """

    # Captures the body of every <entry> element in the raw response text.
    _ENTRY_RE = re.compile('<entry>(.+?)</entry>', re.DOTALL)

    def __init__(self,url):
        """Store the feed ``url`` and the browser-like headers sent with it."""
        # The ``requests`` module itself is used as the "session" object.
        self.session = requests
        self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept-Language': 'en-US,en;q=0.9',
                        'Host': 'www.fpds.gov',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        self.url = url

    def make_requests(self):
        """Download ``self.url`` and return the response body as text.

        A copy of the body (lower-cased, with the ``ns1:`` prefix collapsed
        to ``ns1``) is also written to ``sample.html``.
        """
        # NOTE(review): no timeout is passed, so a stalled server blocks forever.
        r = self.session.get(self.url,headers=self.headers)
        with open('sample.html','w',newline='',encoding='utf-8')as export:
            export.write(r.text.replace('ns1:','ns1').lower())
        return r.text

    def amount_correction(self,source):
        """Format a numeric string as whole dollars, e.g. ``'1234.56'`` -> ``'$1,234'``."""
        return '$'+format_decimal(math.trunc(float(source)), locale='en_US')

    def open_existing(self):
        """Return the distinct, stripped lines of ``existing.csv`` as a list.

        A missing file is treated as "nothing published yet" and yields an
        empty list, so the very first run does not need a hand-made file.
        """
        try:
            # ``with`` guarantees the handle is closed (the file is re-opened
            # for appending at the end of ``parse_xml``).
            with open('existing.csv') as handle:
                return list({line.strip() for line in handle})
        except FileNotFoundError:
            return []

    def _parse_entry(self,section):
        """Extract the de-duplication key and RSS item fields from one entry.

        ``section`` is the raw text between ``<entry>`` and ``</entry>``.
        Returns ``(key, fields)`` where ``fields`` holds the ``title``,
        ``link`` and ``description`` keyword arguments for ``Item``.

        Raises ``IndexError`` when a required element or attribute is absent
        (only ``multiYearContract`` is optional).
        """
        # Parse the entry once and reuse the tree for every lookup. Dropping
        # the colon turns ``ns1:tag`` into a plain tag name, so the XPath
        # expressions below need no namespace handling.
        tree = html.fromstring(section.replace('ns1:','ns1'))

        def first(path):
            """Return the first match of ``path``, stripped of whitespace."""
            return tree.xpath(path)[0].strip()

        funding_office_name = first("//ns1fundingrequestingofficeid//@name")
        performance_city = first('//ns1placeofperformancezipcode//@city')
        vendor_name = first('//ns1vendorname//text()')
        obligated_amount = self.amount_correction(first('//ns1obligatedamount//text()'))
        href = first('//link[@rel="alternate"]//@href')+"&s=FPDS&templateName=1.4.4&indexName=awardfull&sortBy=SIGNED_DATE&desc=Y"
        # Date fields look like "<date> <time>"; only the date part is kept.
        effective_date = first('//ns1effectivedate//text()').split(' ')[0]
        requirement = first('//ns1descriptionofcontractrequirement//text()')
        contracting_agency = first('//ns1contractingofficeagencyid//@name')
        extent_description = first('//ns1extentcompeted//@description')
        psc_description = first('//ns1productorservicecode//@description')
        naics_description = first('//ns1principalnaicscode//@description')
        current_completion = first('//ns1currentcompletiondate//text()').split(' ')[0]
        ultimate_completion = first('//ns1ultimatecompletiondate//text()').split(' ')[0]
        exercised_value = self.amount_correction(first('//ns1totalbaseandexercisedoptionsvalue//text()'))
        all_options_value = self.amount_correction(first('//ns1totalbaseandalloptionsvalue//text()'))
        inherently_gov = first('//ns1inherentlygovernmentalfunction//text()')
        inherently_gov_description = first('//ns1inherentlygovernmentalfunction//@description')
        gfe_gfp = first('//ns1gfe-gfp//text()')
        try:
            multi_year = first('//ns1multiyearcontract//text()')
        except IndexError:
            # Element not present in this entry: the XPath result was empty.
            multi_year = None
        service_act_description = first('//ns1servicecontractact//@description')
        vendor_city = first('//ns1vendorlocation//ns1city//text()')
        vendor_state = first('//ns1vendorlocation//ns1state//text()')
        annual_revenue = self.amount_correction(first('//ns1annualrevenue//text()'))
        employees = first('//ns1numberofemployees//text()')
        vendor_phone = first('//ns1vendorlocation//ns1phoneno//text()')
        for_profit = first('//ns1isforprofitorganization//text()')
        size_determination = first('//ns1contractingofficerbusinesssizedetermination//@description')

        title = "{} in {} – {} to {}".format(funding_office_name,performance_city,obligated_amount,vendor_name)
        # The template is emitted verbatim (indentation and "\n" escapes
        # included); it is filled strictly by position.
        description = """Effective {}, {} has obligated {} for “{}” to be performed in {}.
                            \n\n
                            This is a {} ({}) for {} under the NAICS Code {}.
                            \n\n                            
                            The current and ultimate completion dates are {} and {} respectively.
                            The total value of base and exercised options is {} out of {}.
                            \n\n
                            Inherently Governmental? {} {}
                            \n
                            Government-furnished? {}
                            \n
                            Multiyear? {}
                            \n
                            Service Contact Act? {}
                            \n\n
                            The vendor is {} in {}, {}.
                            \n
                            Revenue: {}
                            \n
                            Employees: {}
                            \n
                            Phone: {}
                            \n\n
                            For Profit? {}
                            \n
                            Status: {}
                            """.format(effective_date,funding_office_name,obligated_amount,requirement,performance_city,
                                       contracting_agency,extent_description,psc_description,naics_description,
                                       current_completion,ultimate_completion,exercised_value,all_options_value,
                                       inherently_gov,inherently_gov_description,gfe_gfp,multi_year,
                                       service_act_description,vendor_name,vendor_city,vendor_state,annual_revenue,employees,
                                       vendor_phone,for_profit,size_determination)

        # PIID plus the date part of the signed date identifies an award
        # across runs.
        key = first('//ns1piid//text()')+first('//ns1signeddate//text()').split(' ')[0].strip()
        return key, {'title': title, 'link': href, 'description': description}

    def parse_xml(self):
        """Build the RSS file from entries that have not been published before.

        Reads the known keys from ``existing.csv``, downloads the feed, and
        for every entry whose key is unknown adds an ``Item``. When at least
        one item was produced, ``fpds_rss_feed.xml`` is (re)written. The new
        keys are then appended to ``existing.csv``. Returns ``None``.
        """
        # A set makes the per-entry membership test O(1).
        known_keys = set(self.open_existing())
        source = self.make_requests()

        items = []
        new_keys = []
        for section in self._ENTRY_RE.findall(source):
            key, fields = self._parse_entry(section)
            if key not in known_keys:
                items.append(Item(**fields))
                new_keys.append(key)

        if items:
            feed = Feed(title = "RSS FEEDS",
                        link = "http://167.99.192.145/fpds_rss_feed.xml",
                        description = "Customized RSS Feed",
                        generator = "Shekhar Samanta",
                        items = items)
            with open("fpds_rss_feed.xml","w",encoding="utf-8")as export:
                export.write(feed.rss())
        # Opened in append mode even when nothing is new, so the file always
        # exists after a run.
        with open('existing.csv','a',newline='')as export1:
            writer = csv.writer(export1)
            writer.writerows([key] for key in new_keys)

# Script entry point: build a Scraper for one saved FPDS ezsearch query
# (requested as an Atom feed via ``rss=1&feed=atom0.3``) and run it.
# NOTE: these statements execute at import time as well as when run directly.
fpds = Scraper('https://www.fpds.gov/ezsearch/fpdsportal?s=FPDSNG.COM&indexName=awardfull&templateName=1.4.4&q=OBLIGATED_AMOUNT%3A%5B50000%2C%29+AND+PRINCIPAL_NAICS_CODE%3A%28541618+OR+541690+OR+541820+OR+541910+OR+541990+OR+561110+OR+561499+OR+561611+OR+561990+OR+921190+OR+922190+OR+923110+OR+923130+OR+928110+OR+928120%29+AND+AGENCY_CODE%3A%280559+OR+1100+OR+1145+OR+1153+OR+1204+OR+1301+OR+1544+OR+1549+OR+1550+OR+1900+OR+3400+OR+7003+OR+7009+OR+7022+OR+7100+OR+7200+OR+7505+OR+7523+OR+8000+OR+8300+OR+8900+OR+9543+OR+9577%29+PRODUCT_OR_SERVICE_CODE%3A%28+AJ96+OR+B506+OR+B507+OR+B522+OR+B544+OR+B548+OR+B549+OR+B550+OR+B551+OR+R405+OR+R406+OR+R407+OR+R408+OR+R409+OR+R412+OR+R419+OR+R422+OR+R423+OR+R426+OR+R499+OR+R699+OR+R706+OR+R707+OR+R708+OR+R799%29+AND+SIGNED_DATE%3A%5B2018%2F03%2F01%2C%29+AND+POP_STATE_NAME%3A%28%22VIRGINIA%22+OR+%22MARYLAND%22+OR+%22DISTRICT+OF+COLUMBIA%22%29+-%22DOMESTIC+AWARDEES+%28UNDISCLOSED%29%22+-%22FOREIGN+AWARDEES%22&rss=1&feed=atom0.3')
# parse_xml() has no return statement, so ``source`` is always None; the
# call is made for its side effects (writing the feed and the key file).
source = fpds.parse_xml()

I tried several HTML line-break tags that Chrome might accept, but none of them worked. It is the Description element that contains the newline character '\n', but the line breaks are not rendered.

To run the script, you will need a blank CSV file named "existing.csv" in the same directory as the script.

Shekhar Samanta
  • 875
  • 2
  • 12
  • 25

0 Answers0