Scatter Plot and trendlines for data subsetted on 2 levels

Question

I have a dataset (included at the end of the question) with 2 groups (group1, group2), each containing 4 scatter measurements (R1, R2, R3, R4). I want to plot:

Scatter of each measurement (R1, R2, R3, R4)
The Trendline of each measurement (R1,R2, R3, R4)
The Average line for each group (group1, group2)
Filled colored error area for each average line (average line +- stddev)

Here is what the plot should look like

As you can see from the plot above, I have successfully plotted it (dataset and working example below). However, the code is very long. Is there a simpler way to do this in plotly? I think that "error area and trendlines for 2 groups" is something a lot of people would want to do.

Code

import pandas as pd
 #model fitting 
from scipy import stats
import numpy as np
 #plotting 
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

 #import the data 
# sep=None asks pandas to sniff the delimiter, which only the Python parsing
# engine supports. Naming the engine explicitly avoids the ParserWarning that
# pandas emits when it has to fall back from the default C engine.
df = pd.read_csv("/content/dfs.csv", sep=None, engine="python")


 #1 Fun: fit the data
def model(df_):
  """Fit a straight line through the "x"/"y" columns of ``df_``.

  Rows where either coordinate is NaN are left out of the fit.

  Args:
    df_: frame with numeric "x" and "y" columns.

  Returns:
    The result object produced by ``scipy.stats.linregress``.
  """
  ys = df_["y"].values
  xs = df_["x"].values
  # A point is usable only when neither of its coordinates is NaN.
  usable = ~(np.isnan(xs) | np.isnan(ys))
  return stats.linregress(xs[usable], ys[usable])

 #2 Fun: make average fit and upper lower bounds
def get_average_line (df_, results_ ):
  """Build the mean fit line and its +/- one-std band for a single group.

  The group label is read from the first "Group" value of ``df_``. The rows
  of ``results_`` with that label are summarised with ``describe()``, and the
  mean and std of the "m" (slope) and "x" (intercept) columns are used to
  evaluate the lines at the unique "x" values of ``df_``.

  Args:
    df_: data of one group, with "Group" and "x" columns.
    results_: per-series fit table with "Group", "m" and "x" columns.

  Returns:
    DataFrame with columns "x", "y", "y_upper" and "y_lower".
  """
  # The local must be called `name_`: the query string refers to it as @name_.
  name_ = df_["Group"].iloc[0]
  summary = results_.query("Group == @name_").describe()
  x = df_["x"].unique()

  slope_mean = summary["m"]["mean"]
  intercept_mean = summary["x"]["mean"]
  slope_std = summary["m"]["std"]
  intercept_std = summary["x"]["std"]

  return pd.DataFrame({
      "x": x,
      "y": slope_mean * x + intercept_mean,
      "y_upper": (slope_mean + slope_std) * x + (intercept_mean + intercept_std),
      "y_lower": (slope_mean - slope_std) * x + (intercept_mean - intercept_std),
  })

 #5 plot the upper and lower bound 
def plot_avearge_lines(df_, name_line="Group", row=1, col=1):
  """Add one group's average line and shaded error band to the global ``fig``.

  Three traces are added to the subplot cell at (row, col): the visible
  average line, an invisible upper-bound line, and an invisible lower-bound
  line that is filled up to the upper bound. A figure named ``fig`` must
  already exist at module level.

  Args:
    df_: data for a single group with columns "Group", "x", "y", "y_upper"
      and "y_lower"; the label comes from the first "Group" value.
    name_line: not used by this function; kept so existing calls still work.
    row: subplot row that receives the traces.
    col: subplot column that receives the traces.
  """
  # Suffix appended to the trace names and used as the legend group.
  name = " " + df_["Group"].iloc[0]

  fig.add_trace(go.Scatter(
        name='Average' + name,
        x=df_["x"],
        y=df_["y"],
        mode='lines',
        line=dict(color='rgb(31, 119, 180)'),
        legendgroup=name
    ),
              row=row, col=col)

  # Upper bound: zero-width line, present only as the fill target below.
  fig.add_trace(go.Scatter(
        name='Upper Bound' + name,
        x=df_["x"],
        y=df_['y_upper'],
        mode='lines',
        marker=dict(color="#444"),
        line=dict(width=0),
        showlegend=False,
        legendgroup=name
    ),
              row=row, col=col)

  # Lower bound: fill='tonexty' fills up to the trace added just before it,
  # so it must follow the upper bound and live in the same subplot cell.
  fig.add_trace(go.Scatter(
        name='Lower Bound' + name,
        x=df_["x"],
        y=df_['y_lower'],
        marker=dict(color="#444"),
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(68, 68, 68, 0.3)',
        fill='tonexty',
        showlegend=False,
        legendgroup=name
    ),
              row=row, col=col)  # was col=row, which ignored the col argument
 
 #scatter of each line (R1, R2, R3, R4)
def plot_scatter_and_fit(df_, color="name", row=1, col=1):
  """Add the scatter points and OLS trendlines of ``df_`` to the global ``fig``.

  A temporary plotly express scatter figure is built with one colour per
  value of the ``color`` column, and each of its traces is copied into the
  subplot cell at (row, col). A figure named ``fig`` must already exist at
  module level.

  Args:
    df_: data with "x" and "y" columns plus the column named by ``color``.
    color: column whose values split the data into coloured series.
    row: subplot row that receives the traces.
    col: subplot column that receives the traces.
  """
  express_fig = px.scatter(df_, x="x", y="y", color=color, title='', trendline="ols")
  for trace in express_fig.data:
      # Honour the requested cell; this was hard-coded to row=1, col=1 before.
      fig.add_trace(trace, row=row, col=col)


##### MAIN ##########
 #1 Fit the data: one linear regression per (Group, Mes) series.
 # `model` is applied to every group; the column of regression results
 # (labelled 0 after reset_index) is renamed to 'fit_results'.
df_grouped_Group_mes = df.groupby(["Group", "Mes"])
results = df_grouped_Group_mes.apply(model).reset_index().rename(columns={0:'fit_results'})
 # Unpack each regression result into plain columns. Note the naming:
 # "x" holds the intercept, "m" the slope and "y-std" holds reg.stderr.
results = pd.concat([results, pd.DataFrame([(reg.intercept, reg.slope, reg.rvalue, reg.pvalue, reg.stderr)  for reg in results["fit_results"]], columns=["x", "m", "r-value", "p-value", "y-std"])], axis=1)
 #2 Make the average fit and its upper/lower bounds, one frame per Group.
df_grouped_Group = df.groupby(["Group"])
avearge_lines = df_grouped_Group.apply(get_average_line, results_=results).reset_index()

 #3 Create the figure: a 1x1 subplot grid holding a single scatter cell.
 # The plotting functions add their traces to this module-level `fig`.
fig = make_subplots(
    rows=1,
    cols=1,
    shared_xaxes=False,
    vertical_spacing=0.03,
    specs=[[{"type": "scatter"}]], #, {"type": "table"}
)



 #4 Scatter and trendline of each series (R1, R2, R3, R4), coloured by "name".
 # apply() is used only for its side effect of adding traces to `fig`.
df_grouped_Group.apply(plot_scatter_and_fit,  color ="name", row=1, col=1)
 
 #5 Plot the average line with its upper and lower bound for each Group.
 # name_line is passed here but is not used inside plot_avearge_lines.
averge_line_grouped_name = avearge_lines.groupby(["Group"])
averge_line_grouped_name.apply(plot_avearge_lines, name_line ="name", row=1, col=1)

 # Layout: axis titles, figure size, and x ticks taken from the "x" column.
fig.update_layout(
    yaxis_title='y',
    xaxis_title ="x",
    title='error area and trendlines for 2 groups of scatters series',
    width=800,
    height=800,
    hovermode="x",
    xaxis = dict(
        tickmode = 'array',
        tickvals = df["x"]        
    ),
)

fig.show()

Dataset

 Group Mes      name     x     y
group1  R1 group1-R1  2.00   3.0
group1  R1 group1-R1 13.00  50.0
group1  R1 group1-R1 25.00  78.0
group1  R1 group1-R1 37.00 130.0
group2  R1 group2-R1  2.00  35.0
group2  R1 group2-R1  6.00  63.0
group2  R1 group2-R1 10.00  93.0
group2  R1 group2-R1 22.00 323.0
group2  R1 group2-R1 34.00 455.0
group1  R2 group1-R2  5.00  16.0
group1  R2 group1-R2 16.00  76.0
group1  R2 group1-R2 28.00 110.0
group1  R2 group1-R2 40.00 153.0
group2  R2 group2-R2  3.00  47.0
group2  R2 group2-R2  7.13  68.0
group2  R2 group2-R2 13.00 111.0
group2  R2 group2-R2 25.00 353.0
group2  R2 group2-R2 37.00 493.0
group1  R3 group1-R3  8.00  45.0
group1  R3 group1-R3 19.00  82.0
group1  R3 group1-R3 31.00 128.0
group2  R3 group2-R3  0.00  12.0
group2  R3 group2-R3  4.00  53.0
group2  R3 group2-R3  8.00  80.0
group2  R3 group2-R3 16.00 121.0
group2  R3 group2-R3 28.00 394.0
group2  R3 group2-R3 40.00 521.0
group1  R4 group1-R4 11.00  45.0
group1  R4 group1-R4 22.00  90.0
group1  R4 group1-R4 34.00 128.0
group2  R4 group2-R4  1.00  13.0
group2  R4 group2-R4  5.00  71.0
group2  R4 group2-R4  9.00   NaN
group2  R4 group2-R4 19.00 139.0
group2  R4 group2-R4 31.00 400.0

Derek O · Answer 1 · 2022-03-02T00:51:19.937

The only thing I can think of is that you are using the .add_trace method to plot the average, upper, and lower bounds and calling this method three times. You can condense this down by using the zip function to iterate through the different arguments at once for each of the traces at the same time, and therefore you only need to write out .add_trace once.

Personally I consider this harder to read than what you already have and it doesn't do anything different from your code or save time, but it is a bit more succinct (20 lines versus 40 lines)

NOTE: I can't reproduce your exact plot because I believe the data sample you included isn't the full dataset. However, I checked that your original plot_avearge_lines function and my plot_average_lines function produce the same plot.

def plot_average_lines(df_, name_line="Group", row=1, col=1):
    """Add a group's average line and shaded error band to the global ``fig``.

    The three traces (average, upper bound, lower bound) differ only in a few
    arguments, so the per-trace settings are zipped together and a single
    ``add_trace`` call is issued per iteration. A figure named ``fig`` must
    already exist at module level.

    Args:
        df_: data for a single group with columns "Group", "x", "y",
            "y_upper" and "y_lower"; the label comes from the first
            "Group" value.
        name_line: not used by this function; kept so existing calls work.
        row: subplot row that receives the traces.
        col: subplot column that receives the traces.
    """
    # Suffix appended to the trace names and used as the legend group.
    name = " " + df_["Group"].iloc[0]
    # Order matters: the lower bound uses fill='tonexty', which fills up to
    # the trace added just before it (the upper bound).
    for bound_type, y_col, marker, line, showlegend, fillcolor, fill in zip(
        ['Average', 'Upper Bound', 'Lower Bound'],
        ["y", "y_upper", "y_lower"],
        [None, dict(color="#444"), dict(color="#444")],
        [dict(color='rgb(31, 119, 180)'), dict(width=0), dict(width=0)],
        [True, False, False],
        [None, None, 'rgba(68, 68, 68, 0.3)'],
        [None, None, 'tonexty']
    ):
        fig.add_trace(
            go.Scatter(
                name=bound_type + name,
                x=df_["x"],
                y=df_[y_col],
                marker=marker,
                line=line,
                mode='lines',
                fillcolor=fillcolor,
                fill=fill,
                showlegend=showlegend,
                legendgroup=name
            ),
            # was col=row, which ignored the col argument
            row=row, col=col)

The result is unchanged:

Thanks @Derek O. As you said, I removed some of the data just so that the example would be reproducible. OK, so there is no way of doing this directly with px.scatter (trendlines, shaded error bands, grouped on 2 levels). Thanks! — Leo, Mar 02 '22 at 07:03
@Leo you could use `px.scatter` but you would still have to add trendlines and shading as separate traces (and adding traces with plotly express is [somewhat inconvenient](https://stackoverflow.com/questions/62122015/how-to-add-traces-in-plotly-express) and probably requires more code than using `go.scatter` for what you are trying to achieve) — Derek O, Mar 02 '22 at 17:48

Scatter Plot and trendlines for data subsetted on 2 levels

1 Answers1