-1

Hello have been trying to create random data with random dates as into a csv file but getting the following error expected str instance, numpy.datetime64 found

code for data generator

import pandas as pd
import numpy as np
import string
import random

def gen_random_email():
    domains = [ "hotmail.com", "gmail.com", "aol.com", "mail.com" , "mail.kz", "yahoo.com"]
    letters = string.ascii_letters +'.'*5
    
    email = ''.join(np.random.choice(list(letters),10))+'@'+ np.random.choice(domains)
    email = email.replace('.@', '@')
    return email, "Email"

def gen_random_float():
    num = np.random.random()*np.random.randint(2000)
    
    decimal_points = np.random.randint(8)
    num = int(num*10**(decimal_points))/10**decimal_points
    
    return str(num), 'Float'

def gen_random_sentence():
    nouns = ["puppy", "car", "rabbit", "girl", "monkey"]
    verbs = ["runs", "hits", "jumps", "drives", "barfs"]
    adv = ["crazily", "dutifully", "foolishly", "merrily", "occasionally"]
    adj = ["adorable.", "clueless.", "dirty.", "odd.", "stupid."]

    random_entry = lambda x: x[random.randrange(len(x))]
    
    random_entry = " ".join([random_entry(nouns), random_entry(verbs), 
                     random_entry(adv), random_entry(adj)])

    return random_entry, 'String'

def gen_random_int():
    num = np.random.randint(1000000)
    
    return str(num), 'Int'

def gen_random_date():
    monthly_days = np.arange(0, 30)
    base_date = np.datetime64('2020-01-01')
    random_date = base_date + np.random.choice(monthly_days)
    return random_date, 'Date'

def gen_dataset(filename, size=5000):
    randomizers = [gen_random_email, gen_random_float, gen_random_int, gen_random_sentence,gen_random_date]
    with open(filename, 'w') as file:
        file.write("Text, Type\n")
        for _ in range(size):
            file.write(",".join(random.choice(randomizers)())+"\n") 

gen_dataset('dataaaa.csv')   
    
TypeError: sequence item 0: expected str instance, numpy.datetime64 found 
thuyein
  • 1,684
  • 13
  • 29
  • When posting a question about code that produces an Exception, always include the complete Traceback - copy and paste it then format it as code (select it and type `ctrl-k`) – wwii Jun 22 '20 at 02:41
  • Any chance you could reduce that to a [mre]? – wwii Jun 22 '20 at 02:43

1 Answers1

0

First, catch the error and see what is causing it.

def gen_dataset(filename, size=5000):
    randomizers = [gen_random_email, gen_random_float, gen_random_int, gen_random_sentence,gen_random_date]
    with open(filename, 'w') as file:
        file.write("Text, Type\n")
        for _ in range(size):
            f = random.choice(randomizers)
            result = f()
            try:
                file.write(",".join(result)+"\n")
            except TypeError:
                print(result)
                raise

>>>
(numpy.datetime64('2020-01-09'), 'Date')
Traceback (most recent call last):
  File "C:\pyProjects\tmp.py", line 80, in <module>
    gen_dataset('dataaaa.csv')
  File "C:\pyProjects\tmp.py", line 75, in gen_dataset
    file.write(",".join(result)+"\n")
TypeError: sequence item 0: expected str instance, numpy.datetime64 found

hmmm, I wonder if join only except strings as arguments?

Yep, from the docs:

A TypeError will be raised if there are any non-string values in iterable, including bytes objects.

I wonder how I can turn a numpy datetime64 to a string. Searching with numpy datetime64 to string is productive: Convert numpy.datetime64 to string object in python

These work

>>> q = gen_random_date()[0]
>>> q
numpy.datetime64('2020-01-27')
>>> np.datetime_as_string(q)
'2020-01-27'
>>> q.astype(str)
'2020-01-27'
>>>

Then just modify the try/except.

def gen_dataset(filename, size=5000):
    randomizers = [gen_random_email, gen_random_float, gen_random_int, gen_random_sentence,gen_random_date]
    with open(filename, 'w') as file:
        file.write("Text, Type\n")
        for _ in range(size):
            f = random.choice(randomizers)
            a,b = f()
            try:
                q = ",".join([a,b,"\n"])
            except TypeError:
                a = np.datetime_as_string(a)
                q = ",".join([a,b,"\n"])
            file.write(q)

Or simply preemptively make the first item a string.

def gen_dataset(filename, size=5000):
    randomizers = [gen_random_email, gen_random_float, gen_random_int, gen_random_sentence,gen_random_date]
    with open(filename, 'w') as file:
        file.write("Text, Type\n")
        for _ in range(size):
            f = random.choice(randomizers)
            a,b = f()
            q = ",".join([str(a),b,"\n"])
            file.write(q)
wwii
  • 23,232
  • 7
  • 37
  • 77