s-monk
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
Lines changed: 3 additions & 3 deletions
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
Lines changed: 11 additions & 5 deletions
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
Lines changed: 1 addition & 1 deletion
diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c
Lines changed: 3 additions & 1 deletion
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
Lines changed: 8 additions & 4 deletions
diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h
Lines changed: 8 additions & 7 deletions
@@ -5814,9 +5814,9 @@
       <entry>The number of distinct nonnull data values in the column.
       A value greater than zero is the actual number of distinct values.
       A value less than zero is the negative of a multiplier for the number
-      of rows in the table; for example, a column in which values appear about
-      twice on the average could be represented by
-      <structfield>stadistinct</> = -0.5.
+      of rows in the table; for example, a column in which about 80% of the
+      values are nonnull and each nonnull value appears about twice on
+      average could be represented by <structfield>stadistinct</> = -0.4.
       A zero value means the number of distinct values is unknown.
       </entry>
      </row>
 
@@ -2102,8 +2102,11 @@ compute_minimal_stats(VacAttrStatsP stats,
 
 		if (nmultiple == 0)
 		{
-			/* If we found no repeated values, assume it's a unique column */
-			stats->stadistinct = -1.0;
+			/*
+			 * If we found no repeated non-null values, assume it's a unique
+			 * column; but be sure to discount for any nulls we found.
+			 */
+			stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
 		}
 		else if (track_cnt < track_max && toowide_cnt == 0 &&
 				 nmultiple == track_cnt)
@@ -2450,8 +2453,11 @@ compute_scalar_stats(VacAttrStatsP stats,
 
 		if (nmultiple == 0)
 		{
-			/* If we found no repeated values, assume it's a unique column */
-			stats->stadistinct = -1.0;
+			/*
+			 * If we found no repeated non-null values, assume it's a unique
+			 * column; but be sure to discount for any nulls we found.
+			 */
+			stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
 		}
 		else if (toowide_cnt == 0 && nmultiple == ndistinct)
 		{
@@ -2755,7 +2761,7 @@ compute_scalar_stats(VacAttrStatsP stats,
 		else
 			stats->stawidth = stats->attrtype->typlen;
 		/* Assume all too-wide values are distinct, so it's a unique column */
-		stats->stadistinct = -1.0;
+		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
 	}
 	else if (null_cnt > 0)
 	{
 
@@ -295,7 +295,7 @@ compute_tsvector_stats(VacAttrStats *stats,
 		stats->stawidth = total_width / (double) nonnull_cnt;
 
 		/* Assume it's a unique column (see notes above) */
-		stats->stadistinct = -1.0;
+		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
 
 		/*
 		 * Construct an array of the interesting hashtable items, that is,
 
@@ -203,7 +203,9 @@ compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
 		/* Do the simple null-frac and width stats */
 		stats->stanullfrac = (double) null_cnt / (double) samplerows;
 		stats->stawidth = total_width / (double) non_null_cnt;
-		stats->stadistinct = -1.0;
+
+		/* Estimate that non-null values are unique */
+		stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
 
 		/* Must copy the target values into anl_context */
 		old_cxt = MemoryContextSwitchTo(stats->anl_context);
 
@@ -4629,14 +4629,16 @@ double
 get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
 {
 	double		stadistinct;
+	double		stanullfrac = 0.0;
 	double		ntuples;
 
 	*isdefault = false;
 
 	/*
 	 * Determine the stadistinct value to use.  There are cases where we can
 	 * get an estimate even without a pg_statistic entry, or can get a better
-	 * value than is in pg_statistic.
+	 * value than is in pg_statistic.  Grab stanullfrac too if we can find it
+	 * (otherwise, assume no nulls, for lack of any better idea).
 	 */
 	if (HeapTupleIsValid(vardata->statsTuple))
 	{
@@ -4645,6 +4647,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
 
 		stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
 		stadistinct = stats->stadistinct;
+		stanullfrac = stats->stanullfrac;
 	}
 	else if (vardata->vartype == BOOLOID)
 	{
@@ -4668,7 +4671,7 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
 			{
 				case ObjectIdAttributeNumber:
 				case SelfItemPointerAttributeNumber:
-					stadistinct = -1.0; /* unique */
+					stadistinct = -1.0; /* unique (and all non null) */
 					break;
 				case TableOidAttributeNumber:
 					stadistinct = 1.0;	/* only 1 value */
@@ -4690,10 +4693,11 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
 	 * If there is a unique index or DISTINCT clause for the variable, assume
 	 * it is unique no matter what pg_statistic says; the statistics could be
 	 * out of date, or we might have found a partial unique index that proves
-	 * the var is unique for this query.
+	 * the var is unique for this query.  However, we'd better still believe
+	 * the null-fraction statistic.
 	 */
 	if (vardata->isunique)
-		stadistinct = -1.0;
+		stadistinct = -1.0 * (1.0 - stanullfrac);
 
 	/*
 	 * If we had an absolute estimate, use that.
 
@@ -57,13 +57,14 @@ CATALOG(pg_statistic,2619) BKI_WITHOUT_OIDS
 	 *		> 0		actual number of distinct values
 	 *		< 0		negative of multiplier for number of rows
 	 * The special negative case allows us to cope with columns that are
-	 * unique (stadistinct = -1) or nearly so (for example, a column in
-	 * which values appear about twice on the average could be represented
-	 * by stadistinct = -0.5).  Because the number-of-rows statistic in
-	 * pg_class may be updated more frequently than pg_statistic is, it's
-	 * important to be able to describe such situations as a multiple of
-	 * the number of rows, rather than a fixed number of distinct values.
-	 * But in other cases a fixed number is correct (eg, a boolean column).
+	 * unique (stadistinct = -1) or nearly so (for example, a column in which
+	 * non-null values appear about twice on the average could be represented
+	 * by stadistinct = -0.5 if there are no nulls, or -0.4 if 20% of the
+	 * column is nulls).  Because the number-of-rows statistic in pg_class may
+	 * be updated more frequently than pg_statistic is, it's important to be
+	 * able to describe such situations as a multiple of the number of rows,
+	 * rather than a fixed number of distinct values.  But in other cases a
+	 * fixed number is correct (eg, a boolean column).
 	 * ----------------
 	 */
 	float4		stadistinct;
Original file line number	Diff line number	Diff line change
`@@ -2102,8 +2102,11 @@ compute_minimal_stats(VacAttrStatsP stats,`
`2102`	`2102`
`2103`	`2103`	`if (nmultiple == 0)`
`2104`	`2104`	`{`
`2105`		`- /* If we found no repeated values, assume it's a unique column */`
`2106`		`- stats->stadistinct = -1.0;`
	`2105`	`+ /*`
	`2106`	`+ * If we found no repeated non-null values, assume it's a unique`
	`2107`	`+ * column; but be sure to discount for any nulls we found.`
	`2108`	`+ */`
	`2109`	`+ stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);`
`2107`	`2110`	`}`
`2108`	`2111`	`else if (track_cnt < track_max && toowide_cnt == 0 &&`
`2109`	`2112`	`nmultiple == track_cnt)`
`@@ -2450,8 +2453,11 @@ compute_scalar_stats(VacAttrStatsP stats,`
`2450`	`2453`
`2451`	`2454`	`if (nmultiple == 0)`
`2452`	`2455`	`{`
`2453`		`- /* If we found no repeated values, assume it's a unique column */`
`2454`		`- stats->stadistinct = -1.0;`
	`2456`	`+ /*`
	`2457`	`+ * If we found no repeated non-null values, assume it's a unique`
	`2458`	`+ * column; but be sure to discount for any nulls we found.`
	`2459`	`+ */`
	`2460`	`+ stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);`
`2455`	`2461`	`}`
`2456`	`2462`	`else if (toowide_cnt == 0 && nmultiple == ndistinct)`
`2457`	`2463`	`{`
`@@ -2755,7 +2761,7 @@ compute_scalar_stats(VacAttrStatsP stats,`
`2755`	`2761`	`else`
`2756`	`2762`	`stats->stawidth = stats->attrtype->typlen;`
`2757`	`2763`	`/* Assume all too-wide values are distinct, so it's a unique column */`
`2758`		`- stats->stadistinct = -1.0;`
	`2764`	`+ stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);`
`2759`	`2765`	`}`
`2760`	`2766`	`else if (null_cnt > 0)`
`2761`	`2767`	`{`