Skip to content

items is 20%~30% slower than iteration via an index #14421

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
mratsim opened this issue May 21, 2020 · 7 comments · Fixed by #14447
Closed

items is 20%~30% slower than iteration via an index #14421

mratsim opened this issue May 21, 2020 · 7 comments · Fixed by #14447

Comments

@mratsim
Copy link
Collaborator

mratsim commented May 21, 2020

The items iterator for sequence is very slow compared to iterating via index.

This is particularly visible on objects that are around 1 cache-line (64 byte) (probably because they flush the items that were prefetched during the iteration.

import math, strformat
import random, times

proc warmup*() =
  # Warmup - make sure cpu is on max perf
  echo "Running warmup to ensure CPU is on max perf if using dynamic frequency"
  let start = cpuTime()
  var foo = 123
  for i in 0 ..< 300_000_000:
    foo += i*i mod 456
    foo = foo mod 789

  # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
  let stop = cpuTime()
  echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"

warmup()

# --------------------------------------------------------

type
  BigTypeSizeCacheLine = object
    # A big ype that almost fill a cache line
    x, y, z: float64
    r: float64
    u, v, w: float64

  VariantKind = enum
    kBig

  VariantCacheLine = object
    # A variant that takes the full cache line (due to padding)
    case kind: VariantKind
    of kBig:
      b: BigTypeSizeCacheLine

let n = 1_000_000_0000

proc initBigTypes(): seq[VariantCacheLine] =
  for i in 0 ..< 10000:
    result.add VariantCacheLine(
      kind: kBig,
      b: BigTypeSizeCacheLine(
        x: rand(1.0), y: rand(1.0), z: rand(1.0),
        r: rand(1.0),
        u: rand(1.0), v: rand(1.0), w: rand(1.0)
      )
    )

proc checkValue(big: BigTypeSizeCacheLine) =
  # Fake work that cannot be optimized away (without a prover)
  doAssert sqrt(
    big.x * big.y * big.z *
    big.r *
    big.u * big.v * big.w
  ) <= 1.0

proc checkValue(variant: VariantCacheLine) =
  case variant.kind
  of kBig:
    checkValue(variant.b)

proc with_iterator(): float64 =
  let s = initBigTypes()

  let t = cpuTime()
  for big in s: # <-------------------
    checkValue(big)

  result = cpuTime() - t

proc with_index(): float64 =
  let s = initBigTypes()

  let t = cpuTime()
  for i in 0 ..< s.len:
    checkValue(s[i])

  result = cpuTime() - t

proc main_cacheline() =
  let it = with_iterator()
  let idx = with_index()

  echo "---------- 56 bytes (+1-byte tag + padding) ----------------"
  echo "Iterator: ", round(it*1e3, 3), " ms"
  echo "Index: ", round(idx*1e3, 3), " ms"
  echo "Speedup: ", round((it - idx)/it * 100, 2), "%"

main_cacheline()

# -------------------------------------------------------------

type
  CryptoPrimitive = object
    # A 512-bit cryptographic primitive (for example a message signature)
    hash: array[64, byte]


  VariantCrypto = object
    # A variant that distinguish between raw unvalidated data
    # and crypto verified data
    case validated: bool
    of true:
      sig: CryptoPrimitive
    of false:
      raw: array[64, byte]

proc initCrypto(): seq[VariantCrypto] =
  for i in 0 ..< 1024:
    result.add VariantCrypto(
      validated: bool(i mod 2)
    )
    if result[^1].validated:
      for j in 0..<64:
        result[^1].sig.hash[j] = byte rand(255)
    else:
      for j in 0..<64:
        result[^1].raw[j] = byte rand(255)

proc checkValue(big: VariantCrypto) =
  # Fake work that cannot be optimized away (without a prover)
  var sum = 0
  if big.validated:
    for i in 0 ..< 64:
      sum += int(big.sig.hash[i])
  else:
    for i in 0 ..< 64:
      sum += int(big.raw[i])

  doAssert sum <= 255 * 64

proc with_iterator_crypto(): float64 =
  let s = initCrypto()

  let t = cpuTime()
  for big in s: # <-------------------
    checkValue(big)

  result = cpuTime() - t

proc with_index_crypto(): float64 =
  let s = initCrypto()

  let t = cpuTime()
  for i in 0 ..< s.len:
    checkValue(s[i])

  result = cpuTime() - t

proc main_crypto() =
  let it = with_iterator_crypto()
  let idx = with_index_crypto()

  echo "---------- 64 bytes (+1-byte tag) ----------------"
  echo "Iterator: ", round(it*1e3, 3), " ms"
  echo "Index: ", round(idx*1e3, 3), " ms"
  echo "Speedup: ", round((it - idx)/it * 100, 2), "%"

main_crypto()

Benchmark on my machine, repeated 5 times via a bash for loop
image

Looking into the generated C code (with -d:danger) this is due to an extra assignment:

N_LIB_PRIVATE N_NIMCALL(NF, with_iterator_crypto__7ulxTAN69bbjasPZaf5U4Dw_3)(void) {
	NF result;
	tySequence__iksy5MLCk8g4iIFr3fb9bPw* s;
	NF t;
	NF T5_;
	result = (NF)0;
	s = initCrypto__dx4Aan6xsXcnZXf3lg5Z5w();
	t = cpuTime__9aodCrWXscOGeNVh2cpuZkw();
	{
		tyObject_VariantCrypto__K8mK9cICDx26SlWO8siUc9cw big;
		NI i;
		NI L;
		NI T2_;
		nimZeroMem((void*)(&big), sizeof(tyObject_VariantCrypto__K8mK9cICDx26SlWO8siUc9cw));
		i = ((NI) 0);
		T2_ = (s ? s->Sup.len : 0);
		L = T2_;
		{
			while (1) {
				if (!(i < L)) goto LA4;
				big = s->data[i];  // <------------------
				checkValue__Y9b2ugRAezbQJvLN60hHNmw((&big));  // <------------------
				i += ((NI) 1);
			} LA4: ;
		}
	}
	T5_ = (NF)0;
	T5_ = cpuTime__9aodCrWXscOGeNVh2cpuZkw();
	result = ((NF)(T5_) - (NF)(t));
	return result;
}
N_LIB_PRIVATE N_NIMCALL(NF, with_index_crypto__7ulxTAN69bbjasPZaf5U4Dw_4)(void) {
	NF result;
	tySequence__iksy5MLCk8g4iIFr3fb9bPw* s;
	NF t;
	NF T5_;
	result = (NF)0;
	s = initCrypto__dx4Aan6xsXcnZXf3lg5Z5w();
	t = cpuTime__9aodCrWXscOGeNVh2cpuZkw();
	{
		NI i;
		NI colontmp_;
		NI T2_;
		NI i_2;
		i = (NI)0;
		colontmp_ = (NI)0;
		T2_ = (s ? s->Sup.len : 0);
		colontmp_ = T2_;
		i_2 = ((NI) 0);
		{
			while (1) {
				if (!(i_2 < colontmp_)) goto LA4;
				i = i_2;
				checkValue__Y9b2ugRAezbQJvLN60hHNmw((&s->data[i])); // <------------------
				i_2 += ((NI) 1);
			} LA4: ;
		}
	}
	T5_ = (NF)0;
	T5_ = cpuTime__9aodCrWXscOGeNVh2cpuZkw();
	result = ((NF)(T5_) - (NF)(t));
	return result;
}

Recommendation

The iterator should return lent which is dependent on #14420

@cooldome
Copy link
Member

cooldome commented May 21, 2020

I have implemented lent T support for iterators. There is another bummer: lent T can't be captured by closures now. This breaks some packages.
I guess we also need nim-lang/RFCs#178 to be implemented to make it happen.

@timotheecour
Copy link
Member

note; with var s = initBigTypes() and for big in mitems(s):, speed is same as with index

@mratsim
Copy link
Collaborator Author

mratsim commented May 22, 2020

note; with var s = initBigTypes() and for big in mitems(s):, speed is same as with index

This supports that lent will solve the issue, mitems will iterate with an address instead of a copy.

@Araq
Copy link
Member

Araq commented May 22, 2020

Fwiw I am using mitems for speed more and more often even if I don't need the mutability. Not good.

@bung87
Copy link
Collaborator

bung87 commented May 22, 2020

Can compiler optimizing it automaticly? if developer not intend to mutate data they may not choose to using mitems

@ghost
Copy link

ghost commented May 22, 2020

@bung87 that's what lent annotations should help with :)

@timotheecour
Copy link
Member

see #14447 which implements that; only 1 test failing (zero-functional); help welcome (see #14447 (comment))

timotheecour added a commit to timotheecour/Nim that referenced this issue May 28, 2020
Araq pushed a commit that referenced this issue May 29, 2020
* fix #14421 items uses lent T for seq + openArray
* add -d:nimWorkaround14447
* fix test
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging a pull request may close this issue.

5 participants