|
568 | 568 | " df.columns = df.columns.str.replace(' ', '')\n",
|
569 | 569 | " df.columns = df.columns.str.replace('-', '_')\n",
|
570 | 570 | " df = df.drop(['Sex_Male'], axis=1)\n",
|
571 |
| - " df = pd.concat([df, cat_vals], axis=1).drop('index', axis=1)\n", |
| 571 | + " if 'index' in df.columns or 'index' in cat_vals.columns:\n", |
| 572 | + " df = pd.concat([df, cat_vals], axis=1).drop('index', axis=1)\n", |
572 | 573 | " # For the model to score correctly, all OHE columns must exist\n",
|
573 | 574 | " input_cols = [\n",
|
574 | 575 | " \"Education_9th\", \"Education_10th\", \"Education_11th\", \"Education_12th\", \"Education_Assoc_voc\", \"Education_Assoc_acdm\", \"Education_Masters\", \"Education_Prof_school\",\n",
|
|
579 | 580 | " 'Relationship_Not_in_family', 'Relationship_Own_child', 'Relationship_Unmarried', 'Relationship_Wife', 'Relationship_Other_relative', 'WorkClass_Private',\n",
|
580 | 581 | " 'Education_Bachelors'\n",
|
581 | 582 | " ]\n",
|
| 583 | + " # OHE columns must be removed after data combination\n", |
| 584 | + " predictor_columns = ['Age', 'HoursPerWeek', 'WorkClass_Private', 'WorkClass_Self', 'WorkClass_Gov', \n", |
| 585 | + " 'WorkClass_Other', 'Education_HS_grad', 'Education_Some_HS', 'Education_Assoc', 'Education_Some_college',\n", |
| 586 | + " 'Education_Bachelors', 'Education_Adv_Degree', 'Education_No_HS', 'MartialStatus_Married_civ_spouse',\n", |
| 587 | + " 'MartialStatus_Never_married', 'MartialStatus_Divorced', 'MartialStatus_Separated', 'MartialStatus_Widowed',\n", |
| 588 | + " 'MartialStatus_Other', 'Relationship_Husband', 'Relationship_Not_in_family', 'Relationship_Own_child', 'Relationship_Unmarried',\n", |
| 589 | + " 'Relationship_Wife', 'Relationship_Other_relative', 'Race_White', 'Race_Black', 'Race_Asian_Pac_Islander',\n", |
| 590 | + " 'Race_Amer_Indian_Eskimo', 'Race_Other', 'Sex_Female']\n", |
| 591 | + "\n", |
582 | 592 | " for col in input_cols:\n",
|
583 | 593 | " if col not in df.columns:\n",
|
584 | 594 | " df[col] = 0\n",
|
| 595 | + " \n", |
| 596 | + "\n", |
585 | 597 | " df[\"Education_Some_HS\"] = df[\"Education_9th\"] | df[\"Education_10th\"] | df[\"Education_11th\"] | df[\"Education_12th\"]\n",
|
586 | 598 | " df[\"Education_Assoc\"] = df[\"Education_Assoc_voc\"] | df[\"Education_Assoc_acdm\"]\n",
|
587 | 599 | " df[\"Education_Adv_Degree\"] = df[\"Education_Masters\"] | df[\"Education_Prof_school\"] | df[\"Education_Doctorate\"]\n",
|
|
593 | 605 | "\n",
|
594 | 606 | " df[\"MartialStatus_Other\"] = df[\"MartialStatus_Married_spouse_absent\"] | df[\"MartialStatus_Married_AF_spouse\"]\n",
|
595 | 607 | "\n",
|
| 608 | + " df = df[predictor_columns]\n", |
| 609 | + "\n", |
596 | 610 | " return df"
|
597 | 611 | ]
|
598 | 612 | },
|
|
1772 | 1786 | ],
|
1773 | 1787 | "metadata": {
|
1774 | 1788 | "kernelspec": {
|
1775 |
| - "display_name": "Python 3", |
| 1789 | + "display_name": "pandatwo", |
1776 | 1790 | "language": "python",
|
1777 | 1791 | "name": "python3"
|
1778 | 1792 | },
|
|
0 commit comments