- 01
 - 02
 - 03
 - 04
 - 05
 - 06
 - 07
 - 08
 - 09
 - 10
 - 11
 - 12
 - 13
 - 14
 - 15
 - 16
 - 17
 - 18
 - 19
 - 20
 - 21
 - 22
 - 23
 - 24
 - 25
 - 26
 - 27
 - 28
 - 29
 - 30
 - 31
 - 32
 - 33
 - 34
 - 35
 - 36
 - 37
 - 38
 - 39
 - 40
 - 41
 - 42
 - 43
 - 44
 - 45
 - 46
 - 47
 - 48
 - 49
 - 50
 - 51
 - 52
 - 53
 - 54
 - 55
 - 56
 - 57
 - 58
 - 59
 - 60
 - 61
 - 62
 - 63
 - 64
 - 65
 - 66
 - 67
 - 68
 - 69
 - 70
 - 71
 - 72
 - 73
 - 74
 - 75
 - 76
 - 77
 - 78
 - 79
 - 80
 - 81
 - 82
 - 83
 - 84
 - 85
 - 86
 - 87
 - 88
 
                        if (neuronsV.size() >= 8)
	{
		auto count = neuronsV.size() - neuronsV.size() % 8;
		__m256* vs = static_cast<__m256*>(alloca(count * sizeof(float) * 3));
		__m256* ws = vs + count / 8;
		__m256* ins = ws + count / 8;
		for (int i = 0; i < count / 8; ++i)
		{
			vs[i] = _mm256_load_ps(&neuronsV[i * 8]);
			ws[i] = _mm256_load_ps(&neuronsW[i * 8]);
			ins[i] = _mm256_load_ps(&neuronsIn[i * 8]);
		}
		static const __m256 div3 = [](void) -> __m256
		{
			float temp[]{ 3.f, 3.f, 3.f, 3.f, 3.f, 3.f, 3.f, 3.f };
			return _mm256_load_ps(temp);
		}();
		__m256 iextv = [&iext](void) -> __m256
		{
			float temp[]{ iext, iext, iext, iext, iext, iext, iext, iext };
			return _mm256_load_ps(temp);
		}();
		__m256 exprdtv = [&dt](void) -> __m256
		{
			float temp[]{ expr * dt, expr * dt, expr * dt, expr * dt, expr * dt, expr * dt, expr * dt, expr * dt };
			return _mm256_load_ps(temp);
		}();
		static const __m256 av = [](void) -> __m256
		{
			float temp[]{ a, a, a, a, a, a, a, a };
			return _mm256_load_ps(temp);
		}();
		static const __m256 bv = [](void) -> __m256
		{
			float temp[]{ b, b, b, b, b, b, b, b };
			return _mm256_load_ps(temp);
		}();
		__m256 thetadt = [&dt](void) -> __m256
		{
			float temp[]{ dt / theta, dt / theta, dt / theta, dt / theta, dt / theta, dt / theta, dt / theta, dt / theta };
			return _mm256_load_ps(temp);
		}();
		for (int i = 0; i < count / 8; ++i)
		{
			// vs += (vs - (vs * vs * vs) / 3.f - ws - iext - ins) * exprdtv
			__m256 nv = _mm256_mul_ps(vs[i], _mm256_mul_ps(vs[i], vs[i]));
			nv = _mm256_sub_ps(vs[i], _mm256_div_ps(nv, div3));
			nv = _mm256_sub_ps(nv, ws[i]);
			nv = _mm256_add_ps(nv, iextv);
			nv = _mm256_add_ps(nv, ins[i]);
			nv = _mm256_mul_ps(nv, exprdtv);
			vs[i] = _mm256_add_ps(vs[i], nv);
			// ws += (vs - av - ws * bv) * thetadt
			nv = _mm256_sub_ps(vs[i], av);
			nv = _mm256_sub_ps(nv, _mm256_mul_ps(ws[i], bv));
			ws[i] = _mm256_add_ps(ws[i], _mm256_mul_ps(nv, thetadt));
		}
		for (int i = 0; i < count / 8; ++i)
		{
			_mm256_storeu_ps(&neuronsV[i * 8], vs[i]);
			_mm256_storeu_ps(&neuronsW[i * 8], ws[i]);
		}
	}
	// scalar edition
	for (int i = 0; i < neuronsV.size() % 8; ++i)
	{
		auto off = neuronsV.size() - 1 - i;
		auto& v = neuronsV[off];
		auto& w = neuronsW[off];
		auto& in = neuronsIn[off];
		v += (v - (v * v * v) / 3.f - w - iext - in) * expr * dt;
		w += (v - a - w * b) * dt / theta;
	}
	std::ranges::fill(neuronsIn, 0.f);