Could you please advise how the following lines should be re-written based on http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
df.drop('PACKETS', axis=1, inplace=True)
produces
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
df.drop('PACKETS', axis=1, inplace=True)
/home/app/ip-spotlight/code/app/ipacc/plugin/ix.py:74: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
df.replace(numpy.nan, "", inplace=True)
produces
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
df.replace(numpy.nan, "", inplace=True)
/home/app/ip-spotlight/code/app/ipacc/plugin/ix.py:68: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
On the other hand, the following is an example of how it was re-written based on the above principle
df.loc[:, ('SRC_PREFIX')] = df[ ['SRC_NET', 'SRC_MASK'] ].apply(lambda x: "/".join(x), axis=1)
But i am unable to figure out how to re-write the cases 1 and 2 ?
EDIT: the code so far it looks like this (df
is the dataframe of interest). So initially the is some kind of casting:
df = pandas.DataFrame(data['payload'], columns=sorted(data['header'], key=data['header'].get))
df = df.astype({
'SRC_AS' : "object",
'DST_AS' : "object",
'COMMS' : "object",
'SRC_COMMS' : "object",
'AS_PATH' : "object",
'SRC_AS_PATH' : "object",
'PREF' : "object",
'SRC_PREF' : "object",
'MED' : "object",
'SRC_MED' : "object",
'PEER_SRC_AS' : "object",
'PEER_DST_AS' : "object",
'PEER_SRC_IP' : "object",
'PEER_DST_IP' : "object",
'IN_IFACE' : "object",
'OUT_IFACE' : "object",
'SRC_NET' : "object",
'DST_NET' : "object",
'SRC_MASK' : "object",
'DST_MASK' : "object",
'PROTOCOL' : "object",
'TOS' : "object",
'SAMPLING_RATE' : "uint64",
'EXPORT_PROTO_VERSION' : "object",
'PACKETS' : "object",
'BYTES' : "uint64",
})
Then the calculate
function of a module is called:
mod.calculate(data['identifier'], data['timestamp'], df)
And the calculate
function is defined like this:
def calculate(identifier, timestamp, df):
try:
# Filter based on AORTA IX.
lut_ipaddr = lookup_ipaddr()
df = df[ (df.PEER_SRC_IP.isin( lut_ipaddr )) ]
if df.shape[0] > 0:
logger.info('analyzing message `{}`'.format(identifier))
# Preparing for input.
df.replace("", numpy.nan, inplace=True)
# Data wrangling. Calculate traffic rate. Reduce.
df.loc[:, ('BPS')] = 8*df['BYTES']*df['SAMPLING_RATE']/300
df.drop(columns=['SAMPLING_RATE', 'EXPORT_PROTO_VERSION', 'PACKETS', 'BYTES'], inplace=True)
# Data wrangling. Formulate prefixes using CIDR notation. Reduce.
df.loc[:, ('SRC_PREFIX')] = df[ ['SRC_NET', 'SRC_MASK'] ].apply(lambda x: "/".join(x), axis=1)
df.loc[:, ('DST_PREFIX')] = df[ ['DST_NET', 'DST_MASK'] ].apply(lambda x: "/".join(x), axis=1)
df.drop(columns=['SRC_NET', 'SRC_MASK', 'DST_NET' ,'DST_MASK'], inplace=True)
# Populate using lookup tables.
df.loc[:, ('NETELEMENT')] = df['PEER_SRC_IP'].apply(lookup_netelement)
df.loc[:, ('IN_IFNAME')] = df.apply(lambda x: lookup_iface(x['NETELEMENT'], x['IN_IFACE']), axis=1)
df.loc[:, ('OUT_IFNAME')] = df.apply(lambda x: lookup_iface(x['NETELEMENT'], x['OUT_IFACE']), axis=1)
# df.loc[:, ('SRC_ASNAME')] = df.apply(lambda x: lookup_asn(x['SRC_AS']), axis=1)
# Add a timestamp.
df.loc[:, ('METERED_ON')] = arrow.get(timestamp, "YYYYMMDDHHmm").format("YYYY-MM-DD HH:mm:ss")
# Preparing for input.
df.replace(numpy.nan, "", inplace=True)
# Finalize !
return identifier, timestamp, df.to_dict(orient="records")
else:
logger.info('going through message `{}` no IX bgp/netflow data were found'.format(identifier))
except Exception as e:
logger.error('processing message `{}` at `{}` caused `{}`'.format(identifier,timestamp,repr(e)), exc_info=True)
return identifier, timestamp, None