double vs. float

This example program shows that using double on MegaAVR microcontrollers does not give you any additional precision. On this platform, double and float are the same type and both offer only 32-bit single-precision arithmetic.

This program is not an edge case; I deliberately did not construct an example where the built-in float/double math fails enormously. It is more of an everyday calculation, e.g. where you have to handle results from a GPS sensor or where you are doing calendar/clock calculations that deal with months and days but also with seconds and maybe even milliseconds. A year already has more than 31.5 million seconds, which is already 7 orders of magnitude.

/* Copyright (c) 2019-2025  Uwe Bissinger */

#include <fp64lib.h>

/**
 * Runs the float / double / float64_t precision comparison once and prints
 * every intermediate result to the serial port (57600 baud).
 *
 * The same operations are applied to a float (a), a double (b) and an
 * fp64lib float64_t (c):
 *   - addition round:        + 1e-5, + 1000, - 1000.00001
 *   - multiplication round:  * 1e-1, * 1000, / 100
 * Each round should end at the starting value again, so after each round the
 * difference to the untouched copies (aa, bb, cc) is printed as well.
 */
void setup() {
	Serial.begin(57600);

	// aa/bb/cc keep the starting value, a/b/c are the working copies.
	// NOTE(review): volatile is presumably used so that the arithmetic is
	// really carried out at run time instead of being folded by the
	// compiler - confirm.
	volatile float aa, a;
	volatile double bb, b;
	volatile float64_t cc, c;

	a = aa = 1.234567890123456;
	b = bb = 1.234567890123456;
	c = cc = fp64_atof("1.234567890123456");

	Serial.println("double is the same as float");
	Serial.print("    sizeof(float) = ");
	Serial.println(sizeof(float));
	Serial.print("   sizeof(double) = ");
	Serial.println(sizeof(double));
	Serial.print("sizeof(float64_t) = ");
	Serial.println(sizeof(float64_t));
	Serial.println();

	Serial.println("Var should be 1.234567890123456");
	Serial.print("    float a = ");
	Serial.println(a, 15);
	Serial.print("   double b = ");
	Serial.println(b, 15);
	Serial.print("float64_t c = ");
	Serial.println(fp64_to_string(c, 17, 15));
	Serial.println();

	// ---- Addition round, step 1: add a small value ----
	Serial.println("Add 1e-5 = 0.00001\nRes should be 1.234577890123456");
	a += 1e-5;
	b += 1e-5;
	c = fp64_add(c, fp64_atof("1e-5"));
	Serial.print("    float a = ");
	Serial.println(a, 15);
	Serial.print("   double b = ");
	Serial.println(b, 15);
	Serial.print("float64_t c = ");
	Serial.println(fp64_to_string(c, 17, 15));
	Serial.println();

	// ---- Addition round, step 2: add a value three orders of magnitude larger ----
	Serial.println("Add 1000\nRes should be 1001.234577890123456");
	a += 1000.0;
	b += 1000.0;
	c = fp64_add(c, fp64_atof("1000"));
	Serial.print("    float a = ");
	Serial.println(a, 15);
	Serial.print("   double b = ");
	Serial.println(b, 15);
	Serial.print("float64_t c = ");
	// Different fp64_to_string arguments (18,16) than for the other values,
	// as this result has four digits before the decimal point.
	Serial.println(fp64_to_string(c, 18, 16));
	Serial.println();

	// ---- Addition round, step 3: undo both additions in a single subtraction ----
	Serial.println("Now subtract 1000.00001\nRes should be 1.234567890123456, our starting value");
	a -= 1000.00001;
	b -= 1000.00001;
	c = fp64_sub(c, fp64_atof("1000.00001"));
	Serial.print("    float a = ");
	Serial.print(a, 15);
	Serial.print(" diff = ");
	Serial.println(a - aa, 15);
	Serial.print("   double b = ");
	Serial.print(b, 15);
	Serial.print(" diff = ");
	Serial.println(b - bb, 15);
	Serial.print("float64_t c = ");
	Serial.print(fp64_to_string(c, 17, 15));
	Serial.print(" diff = ");
	Serial.println(fp64_to_string(fp64_sub(c, cc), 17, 15));
	Serial.println();

	// ---- Multiplication round: restart from the saved starting values ----
	Serial.println("Now with multiplication");
	a = aa;
	b = bb;
	c = cc;

	// Fixed message: 1e-1 is 0.1 (the text previously claimed 0.00001).
	Serial.println("Multiply with 1e-1 = 0.1\nRes should be 0.1234567890123456");
	a *= 1e-1;
	b *= 1e-1;
	c = fp64_mul(c, fp64_atof("1e-1"));
	Serial.print("    float a = ");
	Serial.println(a, 15);
	Serial.print("   double b = ");
	Serial.println(b, 15);
	Serial.print("float64_t c = ");
	Serial.println(fp64_to_string(c, 17, 15));
	Serial.println();

	Serial.println("Multiply 1000\nRes should be 123.4567890123456");
	a *= 1000.0;
	b *= 1000.0;
	c = fp64_mul(c, fp64_atof("1000"));
	Serial.print("    float a = ");
	Serial.println(a, 15);
	Serial.print("   double b = ");
	Serial.println(b, 15);
	Serial.print("float64_t c = ");
	Serial.println(fp64_to_string(c, 18, 16));
	Serial.println();

	// 1e-1 * 1000 = 100, so dividing by 100 should give the starting value again.
	Serial.println("Now divide by 100\nRes should be 1.234567890123456, our starting value");
	a /= 100;
	b /= 100;
	c = fp64_div(c, fp64_atof("100"));
	Serial.print("    float a = ");
	Serial.print(a, 15);
	Serial.print(" diff = ");
	Serial.println(a - aa, 15);
	Serial.print("   double b = ");
	Serial.print(b, 15);
	Serial.print(" diff = ");
	Serial.println(b - bb, 15);
	Serial.print("float64_t c = ");
	Serial.print(fp64_to_string(c, 17, 15));
	Serial.print(" diff = ");
	Serial.println(fp64_to_string(fp64_sub(c, cc), 17, 15));
	Serial.println();
}

// Intentionally empty: all work of this sketch is done once in setup().
void loop()
{
}

Here is the commented output:

double is the same as float
sizeof(float) = 4
sizeof(double) = 4
sizeof(float64_t) = 8

This shows that a double variable uses the same amount of memory as a float variable, i.e. 4 bytes = 32-bit single precision. fp64lib uses 8 bytes or 64 bits.

Var should be 1.234567890123456
float a = 1.234567880630493
double a = 1.234567880630493
float64_t c = 1.234567890123456

Already at compile time, the values are truncated to the 32-bit single-precision format, as not all of the specified digits can be stored in a double or float variable. The value is only precise to 7 decimal digits (8 digits in total); everything after the first “8” is wrong.

Add 1e-5 = 0.00001
Res should be 1.234577890123456
float a = 1.234577894210815
double a = 1.234577894210815
float64_t c = 1.234577890123456

It looks like we “magically” recovered precision, as the result is now precise to a total of 9 digits. However, most of the digits after the first “8” changed as well, so the real difference to the initial value is 1.0013580322e-5 instead of 1e-5.

Add 1000
Res should be 1001.234577890123456
float a = 1001.234558105468750
double a = 1001.234558105468750
float64_t c = 1001.2345778901234

And the magically gained precision is already gone again. The double/float results are only valid to 4 decimal digits (8 digits in total).

Now subtract 1000.00001
Res should be 1.234567890123456, our starting value
float a = 1.234558105468750 diff = -0.000009775161743
double a = 1.234558105468750 diff = -0.000009775161743
float64_t c = 1.234567890123458 diff = 0.000000000000002

After just 3 simple operations, we already have quite a difference to our starting value: we lost 3 digits of precision, and the result is only valid to 4 decimal digits (5 digits in total).

Now with multiplication
Multiply with 1e-1 = 0.00001
Res should be 0.1234567890123456
float a = 0.123456788063049
double a = 0.123456788063049
float64_t c = 0.123456789012346

Multiplication starts similarly: the result is still valid to 8 decimal digits, which is also 8 digits in total.

Multiply 1000
Res should be 123.4567890123456
float a = 123.456794738769531
double a = 123.456794738769531
float64_t c = 123.45678901234561

After the second operation, we lost 1 digit of total precision, now down to 7 digits, 4 decimal digits.

Now divide by 100
Res should be 1.234567890123456, our starting value
float a = 1.234567999839782 diff = 0.000000119209289
double a = 1.234567999839782 diff = 0.000000119209289
float64_t c = 1.234567890123456 diff = 0

In this example, we stay at 7 digits of total precision, i.e. 6 decimal digits. Suppose you had used this for calculations on a digital watch where the time is stored as HH.MMSSmmmm: after only 3 operations, your precision would have degraded to HH.MMSSm, i.e. precise only to 1/10 of a second and no longer down to the millisecond level.

Both results match my experience. Even if the data format is able to store values with about 9 digits of total precision, the usable precision when performing operations is about 5-6 digits – and that is without edge cases.