8

As an exercise to learn Rust I decided to implement a Bit Vector library, with inspiration from std::vec::Vec for which methods to provide.

I have the following code:

extern crate num;

use std::cmp::Eq;
use std::ops::{BitAnd,BitOrAssign,Index,Shl};
use num::{One,Zero,Unsigned,NumCast};

pub trait BitStorage: Sized + 
    BitAnd<Self, Output = Self> + 
    BitOrAssign<Self> + 
    Shl<Self, Output = Self> + 
    Eq + Zero + One + Unsigned + NumCast + Copy {}

impl<S> BitStorage for S where S: Sized + 
    BitAnd<S, Output = S> + 
    BitOrAssign<S> + 
    Shl<S, Output = S> + 
    Eq + Zero + One + Unsigned + NumCast + Copy {}

pub struct BitVector<S: BitStorage> {
    data: Vec<S>,
    capacity: usize,
    storage_size: usize
}

impl<S: BitStorage> BitVector<S> {
    pub fn with_capacity(capacity: usize) -> BitVector<S> {
        let storage_size = std::mem::size_of::<S>() * 8;
        let len = (capacity / storage_size) + 1;
        BitVector { 
            data: vec![S::zero(); len],
            capacity: capacity,
            storage_size: storage_size
        }
    }

    pub fn get(&self, index: usize) -> Option<bool> {
        match self.index_in_bounds(index) {
            true => Some(self.get_unchecked(index)),
            false => None
        }
    }

    pub fn set(&mut self, index: usize, value: bool) {
        self.panic_index_bounds(index);
        let (data_index, remainder) = self.compute_data_index_and_remainder(index);
        let value = if value { S::one() } else { S::zero() };
        self.data[data_index] |= value << remainder;
    }

    pub fn capacity(&self) -> usize {
        self.capacity
    }

    pub fn split_at(&self, index: usize) -> (&BitVector<S>, &BitVector<S>) {
        self.panic_index_not_on_storage_bound(index);
        let data_index = self.compute_data_index(index);
        let (capacity_left, capacity_right) = self.compute_capacities(index);
        let (data_left, data_right) = self.data.split_at(data_index);

        let left = BitVector {
            data: data_left.to_vec(),
            capacity: capacity_left,
            storage_size: self.storage_size
        };
        let right = BitVector {
            data: data_right.to_vec(),
            capacity: capacity_right,
            storage_size: self.storage_size
        };
        (&left, &right)
    }

    pub fn split_at_mut(&mut self, index: usize) -> (&mut BitVector<S>, &mut BitVector<S>) {
        self.panic_index_not_on_storage_bound(index);
        let data_index = self.compute_data_index(index);
        let (capacity_left, capacity_right) = self.compute_capacities(index);
        let (data_left, data_right) = self.data.split_at_mut(data_index);

        let mut left = BitVector {
            data: data_left.to_vec(),
            capacity: capacity_left,
            storage_size: self.storage_size
        };
        let mut right = BitVector {
            data: data_right.to_vec(),
            capacity: capacity_right,
            storage_size: self.storage_size
        };
        (&mut left, &mut right)
    }

    #[inline]
    fn get_unchecked(&self, index: usize) -> bool {
        let (data_index, remainder) = self.compute_data_index_and_remainder(index);
        (self.data[data_index] & (S::one() << remainder)) != S::zero()
    }

    #[inline]
    fn compute_data_index_and_remainder(&self, index: usize) -> (usize, S) {
        let data_index = self.compute_data_index(index);
        let remainder = self.compute_data_remainder(index);
        (data_index, remainder)
    }

    #[inline]
    fn compute_data_index(&self, index: usize) -> usize {
        index / self.storage_size
    }

    #[inline]
    fn compute_data_remainder(&self, index: usize) -> S {
        let remainder = index % self.storage_size;
        // we know that remainder is always smaller or equal to the size that S can hold
        // for example if S = u8 then remainder <= 2^8 - 1
        let remainder: S = num::cast(remainder).unwrap();
        remainder
    }

    #[inline]
    fn compute_capacities(&self, index_to_split: usize) -> (usize, usize) {
        (index_to_split, self.capacity - index_to_split)
    }

    #[inline]
    fn index_in_bounds(&self, index: usize) -> bool {
        index < self.capacity
    }

    #[inline]
    fn panic_index_bounds(&self, index: usize) {
        if !self.index_in_bounds(index) {
            panic!("Index out of bounds. Length = {}, Index = {}", self.capacity, index);
        }
    }

    #[inline]
    fn panic_index_not_on_storage_bound(&self, index: usize) {
        if index % self.storage_size != 0 {
            panic!("Index not on storage bound. Storage size = {}, Index = {}", self.storage_size, index);
        }
    }
}

static TRUE: bool = true;
static FALSE: bool = false;

macro_rules! bool_ref {
    ($cond:expr) => (if $cond { &TRUE } else { &FALSE })
}

impl<S: BitStorage> Index<usize> for BitVector<S> {
    type Output = bool;

    fn index(&self, index: usize) -> &bool {
        self.panic_index_bounds(index);
        bool_ref!(self.get_unchecked(index))
    }
}

The compiler errors occur at the split_at and split_at_mut methods: They basically tell me that left and right in both cases do not live long enough to be returned as a reference. I understand this, because they are created on the stack and then I want to return them as a reference.

However with my design being inspired by std::vec::Vec you can see that in the SliceExt trait their definitions are as follows:

#[stable(feature = "core", since = "1.6.0")]
fn split_at(&self, mid: usize) -> (&[Self::Item], &[Self::Item]);

#[stable(feature = "core", since = "1.6.0")]
fn split_at_mut(&mut self, mid: usize) -> (&mut [Self::Item], &mut [Self::Item]);

I suppose this is done for end user convenience as they rather deal with references than with boxes.

I think I could fix my errors by putting the returned bit vectors into a Box<_>, but is there a way to return the created structs as a reference?

As a bonus question: It does work if I return (BitVector<S>, BitVector<S>), what are the downsides to doing this? Why does the SliceExt trait not do this?

Shepmaster
  • 388,571
  • 95
  • 1,107
  • 1,366
skiwi
  • 66,971
  • 31
  • 131
  • 216
  • 1
    Please produce an [MCVE]. – Shepmaster Jun 13 '16 at 12:46
  • 4
    @Shepmaster I do not see a way to shorten the code in question while still retaining the spirit of the question, ie. that it is a bit vector and how it relates to `std::vec::Vec` and the `SliceExt` trait. – skiwi Jun 13 '16 at 12:48
  • 1
    `get`, `set`, `capacity` functions are irrelevant. Remove arguments, generic types. End up with [this](https://play.rust-lang.org/?gist=aa638e102672e09ebcce3098762cf947&version=stable&backtrace=0). – Shepmaster Jun 13 '16 at 13:03

2 Answers2

10

How to return a newly created struct as a reference?

You can't. No way around this; it's simply impossible. As you said, if it's declared on the stack then the value will be dropped and any references would be invalidated.

So what makes Vec different?

A Vec<T> is the owned counterpart of a slice (&[T]). While a Vec has a pointer to the beginning of data, a count, and a capacity, a slice only has the pointer and a count. Both guarantee that all the data is contiguous. In pseudo-Rust, they look like this:

struct Vec<T> {
    data: *mut T,
    size: usize,
    capacity: usize,
}

struct Slice<'a, T> {
    data: *mut T,
    size: usize,
}

Vec::split_at can return slices because it essentially contains a slice. It's not creating something and returning a reference to it, it's just a copy of the pointer and the count.

If you create a borrowed counterpart to your owned datatype, then you could return that. Something like

struct BitVector {
    data: Vec<u8>,
    capacity: usize,
    storage_size: usize
}

struct BitSlice<'a> {
    data: &'a [u8],
    storage_size: usize,
}

impl BitVector {
    fn with_capacity(capacity: usize) -> BitVector {
        let storage_size = std::mem::size_of::<u8>() * 8;
        let len = (capacity / storage_size) + 1;
        BitVector { 
            data: vec![0; len],
            capacity: capacity,
            storage_size: storage_size
        }
    }

    fn split_at<'a>(&'a self) -> (BitSlice<'a>, BitSlice<'a>) {
        let (data_left, data_right) = self.data.split_at(0);
        let left = BitSlice {
            data: data_left,
            storage_size: self.storage_size
        };
        let right = BitSlice {
            data: data_right,
            storage_size: self.storage_size
        };
        (left, right)
    }
}

fn main() {}

To follow the theme of Vec, you would want to probably Deref and DerefMut to the BitSlice and then implement all the non-capacity-changing methods on the BitSlice.

I suppose this is done for end user convenience as they rather deal with references than with boxes.

References and boxes should be mostly transparent at the use-site. The main reason is performance. A Box is heap-allocated.

I think I could fix my errors by putting the returned bit vectors into a Box<_>

This would not be a good idea. You already have a heap allocation via the Vec, and boxing it would introduce another indirection and extra heap usage.

It does work if I return (BitVector<S>, BitVector<S>), what are the downsides to doing this? Why does the SliceExt trait not do this?

Yes, here you are returning the heap-allocated structures. There's no downside to returning these, there's just the downside of performing the allocations. That's why SliceExt doesn't do it.

Does this also directly translate to the split_at_mut variant?

Yes.

struct BitSliceMut<'a> {
    data: &'a mut [u8],
    storage_size: usize,
}

fn split_at_mut<'a>(&'a mut self) -> (BitSliceMut<'a>, BitSliceMut<'a>) {
    let (data_left, data_right) = self.data.split_at_mut (0);
    let left = BitSliceMut {
        data: data_left,
        storage_size: self.storage_size
    };
    let right = BitSliceMut {
        data: data_right,
        storage_size: self.storage_size
    };
    (left, right)
}

This helps point out that &T and &mut T are different types and behave in different ways.

it's not allowed to give (mut BitSlice<'a>, mut BitSlice<'a> as return type.

It doesn't make sense to return a mut T: What's the difference in `mut` before a variable name and after the `:`?. With a BitSliceMut, the mutability is an aspect of the contained type (&mut [u8]).

Community
  • 1
  • 1
Shepmaster
  • 388,571
  • 95
  • 1,107
  • 1,366
  • Does this also directly translate to the `split_at_mut` variant? Because that seems to be the most interesting case, yet it's not allowed to give `(mut BitSlice<'a>, mut BitSlice<'a>` as return type. – skiwi Jun 13 '16 at 18:03
  • @skiwi added more. – Shepmaster Jun 13 '16 at 18:28
  • I think I'm beginning to understand it now, thanks for the explanations. So this is really the best I can get? Because slices are a no-go as this structure represents a bit vector which cannot be represented as a slice (because even `bool` uses `u8` as backing). – skiwi Jun 13 '16 at 18:32
  • After trying to implement this and failing I've come to the following (hopefully temporary) conclusion: It's not possible to implement the `Deref` and `DerefMut` traits because you still end up trying to create a `BitSlice` resp `BitSliceMut` and returning a reference to something on the stack does not work. – skiwi Jun 13 '16 at 19:44
3

The answer to why the standard library is 'allowed' to return by reference is, that it does not allocate anything on the stack. It returns references to already allocated memory which lives long enough.

So you have basically two choices:

  • If you allocate memory on the stack you have to return it as value. This includes the Box<_> scenario. You return the Box, which has a pointer to heap allocated memory, as value.

  • If you do not allocate memory on the stack you can return references to the result which already lives in memory.

In Rust it is efficient to return by value, as the value is moved, not copied.

JDemler
  • 1,246
  • 14
  • 22