This example program shows that using double on the MegaAVR microcontroller does not provide any additional precision. Double and float are the same type there and only offer 32-bit single-precision arithmetic.
This program is not an edge case; I deliberately did not construct an example where the built-in float/double math fails enormously. It is more of an everyday calculation, e.g. handling results from a GPS sensor or doing calendar/clock calculations that deal with months and days but also with seconds and maybe even milliseconds. A year already has more than 31.5 million seconds, which is already 7 orders of magnitude.
/* Copyright (c) 2019-2025 Uwe Bissinger */
#include <fp64lib.h>
// Runs the whole precision comparison once at start-up and prints it to Serial.
//
// The same sequence of operations is applied to three variables: a float (a),
// a double (b) and an fp64lib float64_t (c). After each step the expected
// result and the three actual results are printed, so the outputs can be
// compared line by line. aa/bb/cc keep the starting value so the final
// difference to it can be printed at the end of each test series.
void setup() {
  Serial.begin(57600);

  // NOTE(review): volatile presumably keeps the compiler from evaluating the
  // arithmetic at compile time, so the math runs on the target - confirm.
  volatile float aa, a;
  volatile double bb, b;
  volatile float64_t cc, c;

  // Same decimal literal for all three types; fp64lib parses it from a string.
  a = aa = 1.234567890123456;
  b = bb = 1.234567890123456;
  c = cc = fp64_atof("1.234567890123456");

  // --- Storage size of each type ---
  Serial.println("double is the same as float");
  Serial.print(" sizeof(float) = ");
  Serial.println(sizeof(float));
  Serial.print(" sizeof(double) = ");
  Serial.println(sizeof(double));
  Serial.print("sizeof(float64_t) = ");
  Serial.println(sizeof(float64_t));
  Serial.println();

  // --- Starting value as stored ---
  Serial.println("Var should be 1.234567890123456");
  Serial.print(" float a = ");
  Serial.println(a, 15);
  Serial.print(" double b = ");
  Serial.println(b, 15);
  Serial.print("float64_t c = ");
  Serial.println(fp64_to_string(c, 17, 15));
  Serial.println();

  // --- Series 1: addition / subtraction ---
  Serial.println("Add 1e-5 = 0.00001\nRes should be 1.234577890123456");
  a += 1e-5;
  b += 1e-5;
  c = fp64_add(c, fp64_atof("1e-5"));
  Serial.print(" float a = ");
  Serial.println(a, 15);
  Serial.print(" double b = ");
  Serial.println(b, 15);
  Serial.print("float64_t c = ");
  Serial.println(fp64_to_string(c, 17, 15));
  Serial.println();

  Serial.println("Add 1000\nRes should be 1001.234577890123456");
  a += 1000.0;
  b += 1000.0;
  c = fp64_add(c, fp64_atof("1000"));
  Serial.print(" float a = ");
  Serial.println(a, 15);
  Serial.print(" double b = ");
  Serial.println(b, 15);
  Serial.print("float64_t c = ");
  Serial.println(fp64_to_string(c, 18, 16));
  Serial.println();

  // Undo both additions in one step; ideally this returns to the start value.
  Serial.println("Now subtract 1000.00001\nRes should be 1.234567890123456, our starting value");
  a -= 1000.00001;
  b -= 1000.00001;
  c = fp64_sub(c, fp64_atof("1000.00001"));
  Serial.print(" float a = ");
  Serial.print(a, 15);
  Serial.print(" diff = ");
  Serial.println(a - aa, 15);
  Serial.print(" double b = ");
  Serial.print(b, 15);
  Serial.print(" diff = ");
  Serial.println(b - bb, 15);
  Serial.print("float64_t c = ");
  Serial.print(fp64_to_string(c, 17, 15));
  Serial.print(" diff = ");
  Serial.println(fp64_to_string(fp64_sub(c, cc), 17, 15));
  Serial.println();

  // --- Series 2: multiplication / division, restarting from the start value ---
  Serial.println("Now with multiplication");
  a = aa;
  b = bb;
  c = cc;

  Serial.println("Multiply with 1e-1 = 0.1\nRes should be 0.1234567890123456");
  a *= 1e-1;
  b *= 1e-1;
  c = fp64_mul(c, fp64_atof("1e-1"));
  Serial.print(" float a = ");
  Serial.println(a, 15);
  Serial.print(" double b = ");
  Serial.println(b, 15);
  Serial.print("float64_t c = ");
  Serial.println(fp64_to_string(c, 17, 15));
  Serial.println();

  Serial.println("Multiply 1000\nRes should be 123.4567890123456");
  a *= 1000.0;
  b *= 1000.0;
  c = fp64_mul(c, fp64_atof("1000"));
  Serial.print(" float a = ");
  Serial.println(a, 15);
  Serial.print(" double b = ");
  Serial.println(b, 15);
  Serial.print("float64_t c = ");
  Serial.println(fp64_to_string(c, 18, 16));
  Serial.println();

  // 0.1 * 1000 / 100 == 1, so ideally this returns to the start value.
  Serial.println("Now divide by 100\nRes should be 1.234567890123456, our starting value");
  a /= 100;
  b /= 100;
  c = fp64_div(c, fp64_atof("100"));
  Serial.print(" float a = ");
  Serial.print(a, 15);
  Serial.print(" diff = ");
  Serial.println(a - aa, 15);
  Serial.print(" double b = ");
  Serial.print(b, 15);
  Serial.print(" diff = ");
  Serial.println(b - bb, 15);
  Serial.print("float64_t c = ");
  Serial.print(fp64_to_string(c, 17, 15));
  Serial.print(" diff = ");
  Serial.println(fp64_to_string(fp64_sub(c, cc), 17, 15));
  Serial.println();
}
// Nothing to repeat: all output is produced once in setup().
void loop() {}
Here is the commented output:
double is the same as float
sizeof(float) = 4
sizeof(double) = 4
sizeof(float64_t) = 8
This shows that double variables use the same amount of memory as float variables, i.e. 4 bytes = 32-bit single precision. fp64lib uses 8 bytes or 64 bits.
Var should be 1.234567890123456
float a = 1.234567880630493
double b = 1.234567880630493
float64_t c = 1.234567890123456
Already at compile time, the values are truncated to 32-bit single-precision format, as all the digits specified cannot be stored in a double or float variable. The value is only precise to 7 decimal digits (8 digits in total); everything after the first “8” is wrong.
Add 1e-5 = 0.00001
Res should be 1.234577890123456
float a = 1.234577894210815
double b = 1.234577894210815
float64_t c = 1.234577890123456
It looks like we “magically” recovered precision, as the result is now precise to a total of 9 digits. However, most of the digits after the first “8” changed, so the real difference to the initial value is 1.0013580322e-5 instead of 1e-5.
Add 1000
Res should be 1001.234577890123456
float a = 1001.234558105468750
double b = 1001.234558105468750
float64_t c = 1001.2345778901234
And the magically gained precision is already gone again. The double/float results are only valid to 4 decimal digits (8 digits in total).
Now subtract 1000.00001
Res should be 1.234567890123456, our starting value
float a = 1.234558105468750 diff = -0.000009775161743
double b = 1.234558105468750 diff = -0.000009775161743
float64_t c = 1.234567890123458 diff = 0.000000000000002
After just 3 simple operations, we already have quite some difference from our starting value: we lost 3 digits of precision, and the result is only valid to 4 decimal digits (5 digits in total).
Now with multiplication
Multiply with 1e-1 = 0.1
Res should be 0.1234567890123456
float a = 0.123456788063049
double b = 0.123456788063049
float64_t c = 0.123456789012346
Multiplication starts similarly: the result is still valid to 8 decimal digits, which is also 8 digits in total.
Multiply 1000
Res should be 123.4567890123456
float a = 123.456794738769531
double b = 123.456794738769531
float64_t c = 123.45678901234561
After the second operation, we lost 1 digit of total precision, now down to 7 digits, 4 decimal digits.
Now divide by 100
Res should be 1.234567890123456, our starting value
float a = 1.234567999839782 diff = 0.000000119209289
double b = 1.234567999839782 diff = 0.000000119209289
float64_t c = 1.234567890123456 diff = 0
In this example, we stay at 7 digits of total precision, 6 decimal digits. If you had used this for calculations on a digital watch where the time is stored as HH.MMSSmmmm, your precision would have degraded to HH.MMSSm after only 3 operations, i.e. precise only to 1/10 of a second, no longer down to millisecond level.
Both results match my experience. Even if the data format is able to store values with about 9 digits of total precision, the usable precision when performing operations is about 5-6 digits – without edge cases.