I have the following problem: I want to extract data from HDFS (a table called 'complaint'). I wrote the following script, which actually works:
import pandas as pd
from hdfs import InsecureClient
import os

file = open("test.txt", "wb")

print("Step 1")
client_hdfs = InsecureClient('http://XYZ')
N = 10

print("Step 2")
with client_hdfs.read('/user/.../complaint/000000_0') as reader:
    print('new line')
    features = reader.read(1000000)
    file.write(features)

print('end')
file.close()
My problem now is that the folder "complaint" contains 4 files (I don't know which file type), and the read operation gives me back bytes which I can't use any further (I saved them to a text file as a test and it looks like this:
My question now is: is it possible to get the data separated by column in a meaningful way?
I have only found solutions for .csv files and the like, and I'm somehow stuck here... :-)
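To make clearer what I mean by "separated by column": here is a rough sketch of what I imagine (assuming the files in the 'complaint' folder are plain Hive text files with the default Ctrl-A / \x01 field delimiter and no header row; 'col_a' and 'col_b' are just placeholder column names, I don't know the real schema):

import pandas as pd
from hdfs import InsecureClient

client_hdfs = InsecureClient('http://XYZ')

# list the 4 files in the table folder to see what is actually there
print(client_hdfs.list('/user/.../complaint'))

# parse one file directly into a DataFrame, assuming Hive's default text format
with client_hdfs.read('/user/.../complaint/000000_0', encoding='utf-8') as reader:
    df = pd.read_csv(reader, sep='\x01', header=None, names=['col_a', 'col_b'])

print(df.head())

If the files are not text at all (e.g. ORC or Parquet), this obviously won't work, which is why I started trying the other libraries below.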
EDIT: I made changes to my solution and tried different approaches, but none of them really works. Here's the updated code:
import pandas as pd
from hdfs import InsecureClient
import os
import pypyodbc
import pyspark
from pyspark import SparkConf, SparkContext
from hdfs3 import HDFileSystem
import pyarrow.parquet as pq
import pyarrow as pa
from pyhive import hive
#Step 0: Configurations
#Connections with InsecureClient (this basically works)
#Notes: TMS1 doesn't work because of txt files
#insec_client_tms1 = InsecureClient ('http://some-adress:50070')
insec_client_tms2 = InsecureClient('http://some-adress:50070')
#Connection with Spark (not working at the moment)
#Error: Java gateway process exited before sending its port number
#conf = SparkConf().setAppName('TMS').setMaster('spark://adress-of-node:7077')
#sc = SparkContext(conf=conf)
#Connection via PyArrow (not working)
#Error: File not found
#fs = pa.hdfs.connect(host='hdfs://node-adress', port =8020)
#print("FS: " + fs)
#connection via HDFS3 (not working)
#The module couldn't be loaded
#client_hdfs = HDFileSystem(host='hdfs://node-adress', port=8020)
#Connection via Hive (not working)
#no module named sasl -> I tried to install it, but it also fails
#conn = hive.Connection(host='hdfs://node-adress', port=8020, database='deltatest')
#Step 1: Extractions
print ("starting Extraction")
#Create file
file = open ("extraction.txt", "w")
#Extraction with Spark
#text = sc.textFile('/user/hive/warehouse/XYZ.db/baseorder_flags/000000_0')
#first_line = text.first()
#print (first_line)
#extraction with hive
#df = pd.read_sql ('select * from baseorder',conn)
#print ("DF: "+ df)
#extraction with hdfs3
#with client_hdfs.open('/home/deltatest/basedeviation/000000_0') as f:
# df = pd.read_parquet(f)
#Extraction with Webclient (not working)
#Error: Arrow error: IOError: seek -> fastparquet has a similar error
with insec_client_tms2.read('/home/deltatest/basedeviation/000000_0') as reader:
    features = pd.read_parquet(reader)
    print(features)
    #features = reader.read()
    #data = features.decode('utf-8', 'replace')

print("saving data to file")
file.write(str(features))
print('end')
file.close()
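For the one attempt that is not commented out, my guess (just an assumption, I haven't verified it) is that the WebHDFS reader isn't seekable, which is where the "IOError: seek" comes from. A sketch of the workaround I'd try next, buffering the whole file into a seekable io.BytesIO first (assuming the files under 'basedeviation' really are Parquet):

import io
import pandas as pd
from hdfs import InsecureClient

insec_client_tms2 = InsecureClient('http://some-adress:50070')

# buffer the whole file in memory so pandas/pyarrow can seek inside it
with insec_client_tms2.read('/home/deltatest/basedeviation/000000_0') as reader:
    buffer = io.BytesIO(reader.read())

# assuming the file is actually Parquet
df = pd.read_parquet(buffer)
print(df)

If that assumption is wrong, any pointer to the right way of getting these Hive tables into pandas would be very welcome.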