{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "machine_shape": "hm", "gpuType": "L4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "17f8d0b0ef4347fd81984c46fbb9e684": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_7d78e730875146efa1dd35b906344678", "IPY_MODEL_dc1906aa05194549a4d6dcc85d92dc09", "IPY_MODEL_1f6c2b5fc10a4a98a5380a13fa70bc75" ], "layout": "IPY_MODEL_dba61edae6884c92bdf9a3d76dec59c9" } }, "7d78e730875146efa1dd35b906344678": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_99a31b328dbf401c9db2b340cb2105d5", "placeholder": "​", "style": "IPY_MODEL_76efebd872c044b9a7f8c38d57eac49f", "value": "Epoch 1: 100%" } }, "dc1906aa05194549a4d6dcc85d92dc09": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5af975c1370f43d59b97ed336c866e0c", "max": 935, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_4a035406ac194a67afca19b313e05b9a", "value": 935 } }, "1f6c2b5fc10a4a98a5380a13fa70bc75": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c429a347002a43fe8a0a57f5797767d7", "placeholder": "​", "style": "IPY_MODEL_cce3e0efd2bb4b978c6fa150b6cac602", "value": " 935/935 [05:06<00:00,  3.30it/s, acc=21.8%]" } }, "dba61edae6884c92bdf9a3d76dec59c9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "99a31b328dbf401c9db2b340cb2105d5": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "76efebd872c044b9a7f8c38d57eac49f": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "5af975c1370f43d59b97ed336c866e0c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4a035406ac194a67afca19b313e05b9a": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "c429a347002a43fe8a0a57f5797767d7": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "cce3e0efd2bb4b978c6fa150b6cac602": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "da18291cbf374325ac676de844a353e7": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_4e910a07266a40caabcc33eeb941bdc0", "IPY_MODEL_9c602bb5011d48c8b04404f6b3a37fab", "IPY_MODEL_a87e1520f5254be18114a5f782e4e234" ], "layout": "IPY_MODEL_fd027301b7a6496eb8ae039c25bc4d4b" } }, "4e910a07266a40caabcc33eeb941bdc0": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_9b3468290ae84339ac7c8ce2a9976741", "placeholder": "​", "style": "IPY_MODEL_8beb040b66a1413abd534d5d455d71e2", "value": "Validating: 100%" } }, "9c602bb5011d48c8b04404f6b3a37fab": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b0fc7aa19e194e91a3138867b2377db9", "max": 165, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_e9a139d721a147d58952f4a1efc0401b", "value": 165 } }, "a87e1520f5254be18114a5f782e4e234": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b2c27f1f8a5f437e8f359f1e0ea05fbd", "placeholder": "​", "style": "IPY_MODEL_1f697938d7e845bb9adcbb3015606f70", "value": " 165/165 [00:52<00:00,  3.17it/s]" } }, "fd027301b7a6496eb8ae039c25bc4d4b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9b3468290ae84339ac7c8ce2a9976741": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8beb040b66a1413abd534d5d455d71e2": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b0fc7aa19e194e91a3138867b2377db9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e9a139d721a147d58952f4a1efc0401b": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "b2c27f1f8a5f437e8f359f1e0ea05fbd": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1f697938d7e845bb9adcbb3015606f70": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "7909e231d23b447c9a0975ed72e00f60": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_654bf91d7d694141aa9d3382e72439cb", "IPY_MODEL_ddbba23be5cd42d8a12669ae9ae7f0fc", "IPY_MODEL_94201956b5774585b99637bf2157f1b0" ], "layout": "IPY_MODEL_0d3995008ff149828c81d2b91f18be0b" } }, "654bf91d7d694141aa9d3382e72439cb": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a3f9669d01d241e99b7a7bdd98792353", "placeholder": "​", "style": "IPY_MODEL_e361562b232c49e6b4771a10106d0d4d", "value": "Epoch 2: 100%" } }, "ddbba23be5cd42d8a12669ae9ae7f0fc": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b0b5f7a2436d4d51b0760b1162f219db", "max": 935, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_4ceb35fd88a242218cb4992f631fe68c", "value": 935 } }, "94201956b5774585b99637bf2157f1b0": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_faca4620999e4480823607b9a673e570", "placeholder": "​", "style": "IPY_MODEL_b2548dde84694f589b108a28ca34c4f2", "value": " 935/935 [05:08<00:00,  2.73it/s, acc=28.8%]" } }, "0d3995008ff149828c81d2b91f18be0b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a3f9669d01d241e99b7a7bdd98792353": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e361562b232c49e6b4771a10106d0d4d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b0b5f7a2436d4d51b0760b1162f219db": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4ceb35fd88a242218cb4992f631fe68c": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "faca4620999e4480823607b9a673e570": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b2548dde84694f589b108a28ca34c4f2": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "990d32fd8ad943898db8f524175171ba": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_56543f9f03674385a106f46b3736a270", "IPY_MODEL_0856e831cbb7419fbbe66e7e7fb99906", "IPY_MODEL_f4f091fd560242a28cf09b6c33d7c9a3" ], "layout": "IPY_MODEL_7421731362ea4c25abceefb9cf2c5d1b" } }, "56543f9f03674385a106f46b3736a270": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_26e92b3e139a464a848b68a55a8095a8", "placeholder": "​", "style": "IPY_MODEL_75cf349e98bd4a6bb45a66a112acfa30", "value": "Validating: 100%" } }, "0856e831cbb7419fbbe66e7e7fb99906": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_abbb218edf3a4f54bcd72acecfdd17b3", "max": 165, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_72bc8f82e228452fb5faa36ff84d03a4", "value": 165 } }, "f4f091fd560242a28cf09b6c33d7c9a3": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_bbee56a1661c4347bc538ad204e4efe2", "placeholder": "​", "style": "IPY_MODEL_05669dee14ff4356a8185285ee9c2951", "value": " 165/165 [00:52<00:00,  3.18it/s]" } }, "7421731362ea4c25abceefb9cf2c5d1b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "26e92b3e139a464a848b68a55a8095a8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "75cf349e98bd4a6bb45a66a112acfa30": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "abbb218edf3a4f54bcd72acecfdd17b3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "72bc8f82e228452fb5faa36ff84d03a4": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "bbee56a1661c4347bc538ad204e4efe2": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "05669dee14ff4356a8185285ee9c2951": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a1dbd51ede52442e8b6b2cef26e8bbd9": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_b62503ae035d447ab609dff304523785", "IPY_MODEL_d98feea8dca847a1a087c19e0ffb40db", "IPY_MODEL_903bdef3de794bd0a0978a9fa847dcc8" ], "layout": "IPY_MODEL_81acd630739a4a99a33bc5c060b56c0b" } }, "b62503ae035d447ab609dff304523785": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_3f1848c901e14b6d86f9e10a52ca607e", "placeholder": "​", "style": "IPY_MODEL_b47f59b606b94a68a3361a185e0040aa", "value": "Epoch 3:  93%" } }, "d98feea8dca847a1a087c19e0ffb40db": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5f47532448504e4c8398688d85787201", "max": 935, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_0e8901c365954716932d4a09bb5e615f", "value": 867 } }, "903bdef3de794bd0a0978a9fa847dcc8": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7527f720a46a4ed384f09866b6550e6e", "placeholder": "​", "style": "IPY_MODEL_d5df03c93765465fb8bc49dc19c88786", "value": " 867/935 [04:44<00:23,  2.91it/s, acc=33.5%]" } }, "81acd630739a4a99a33bc5c060b56c0b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3f1848c901e14b6d86f9e10a52ca607e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b47f59b606b94a68a3361a185e0040aa": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "5f47532448504e4c8398688d85787201": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0e8901c365954716932d4a09bb5e615f": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "7527f720a46a4ed384f09866b6550e6e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d5df03c93765465fb8bc49dc19c88786": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "source": [ "try:\n", " !pip uninstall -qy geometricvocab geofractal\n", "except:\n", " pass\n", "\n", "!pip install -q git+https://github.com/AbstractEyes/geofractal.git" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LhqMG1Ayd6W6", "outputId": "26f76fc9-8243-4029-db43-aba0a125e027" }, "execution_count": 21, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for geofractal (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for geometricvocab (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n" ] } ] }, { "cell_type": "code", "source": [ "# Cell: Setup Qwen2.5-Math-1.5B + T5 Hierarchical Collective\n", "import torch\n", "import torch.nn as nn\n", "from transformers import AutoTokenizer, AutoModel, T5Tokenizer, T5EncoderModel\n", "from datasets import load_dataset\n", "from torch.utils.data import DataLoader\n", "from geofractal.router.head import build_standard_head, HeadConfig\n", "from tqdm.auto import tqdm\n", "import re\n", "\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "print(f\"Device: {device}\")\n", "\n", "# ============================================================================\n", "# FROZEN BACKBONES\n", "# ============================================================================\n", "\n", "# Qwen2.5-Math-1.5B - actual math reasoning model\n", "qwen_tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen2.5-Math-1.5B\", trust_remote_code=True)\n", "qwen = AutoModel.from_pretrained(\"Qwen/Qwen2.5-Math-1.5B\", trust_remote_code=True).to(device)\n", "qwen.eval()\n", "for p in qwen.parameters():\n", " p.requires_grad = False\n", "\n", "# T5-base - general language\n", "t5_tokenizer = T5Tokenizer.from_pretrained(\"t5-base\")\n", "t5 = T5EncoderModel.from_pretrained(\"t5-base\").to(device)\n", "t5.eval()\n", "for p in t5.parameters():\n", " p.requires_grad = False\n", "\n", "print(f\"Qwen2.5-Math hidden: {qwen.config.hidden_size}\")\n", "print(f\"T5 hidden: {t5.config.d_model}\")\n", "print(f\"Qwen params: {sum(p.numel() for p in qwen.parameters()):,} (frozen)\")\n", "print(f\"T5 params: {sum(p.numel() for p in t5.parameters()):,} (frozen)\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_zXY4Iktog6n", "outputId": "8d2e1538-f154-4699-851e-f2c7ced877f1" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Device: cuda\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Qwen2.5-Math hidden: 1536\n", "T5 hidden: 768\n", "Qwen params: 1,543,714,304 (frozen)\n", "T5 params: 109,628,544 (frozen)\n" ] } ] }, { "cell_type": "code", "source": [ "# Cell: Hierarchical routing structure\n", "#\n", "# Level 1: Base streams (T5 linguistic + Deterministic)\n", "# Level 2: Qwen-Math as supervisor/verifier\n", "#\n", "# The hierarchy: Qwen sees the fused lower-level representation\n", "# and can override/refine it\n", "\n", "ROUTE_DIM = 512\n", "QWEN_DIM = qwen.config.hidden_size # 1536 for 1.5B\n", "\n", "# === LEVEL 1: Base Streams ===\n", "\n", "# Projections\n", "proj_t5 = nn.Linear(768, ROUTE_DIM).to(device)\n", "\n", "# Deterministic streams (simplified - keep best performers)\n", "class SymbolicCalculatorStream(nn.Module):\n", " def __init__(self, output_dim):\n", " super().__init__()\n", " self.num_features = 12\n", " self.projection = nn.Linear(self.num_features, output_dim)\n", "\n", " def extract_numbers(self, text):\n", " pattern = r'-?\\d+\\.?\\d*'\n", " matches = re.findall(pattern, text)\n", " return [float(m) for m in matches if m not in ['-', '.']][:10]\n", "\n", " def compute_features(self, numbers):\n", " if len(numbers) == 0:\n", " return torch.zeros(self.num_features)\n", " t = torch.tensor(numbers, dtype=torch.float32)\n", " return torch.tensor([\n", " len(numbers), t.sum().item(),\n", " t.prod().item() if len(t) < 10 else 0,\n", " t.mean().item(), t.std().item() if len(t) > 1 else 0,\n", " t.min().item(), t.max().item(), (t.max() - t.min()).item(),\n", " (t > 0).sum().item(), (t < 0).sum().item(),\n", " (t == 0).sum().item(), t.abs().mean().item(),\n", " ], dtype=torch.float32)\n", "\n", " def forward(self, texts, seq_len):\n", " features = torch.stack([self.compute_features(self.extract_numbers(t)) for t in texts])\n", " features = features.to(next(self.parameters()).device)\n", " return self.projection(features.unsqueeze(1).expand(-1, seq_len, -1))\n", "\n", "symbolic_stream = SymbolicCalculatorStream(output_dim=ROUTE_DIM).to(device)\n", "\n", "# Level 1 heads\n", "head_config = HeadConfig(feature_dim=ROUTE_DIM, fingerprint_dim=64, num_anchors=16, num_routes=4)\n", "head_t5 = build_standard_head(head_config).to(device)\n", "head_symbolic = build_standard_head(head_config).to(device)\n", "\n", "# Level 1 fusion\n", "fusion_l1 = nn.Sequential(\n", " nn.Linear(ROUTE_DIM * 2, ROUTE_DIM),\n", " nn.LayerNorm(ROUTE_DIM),\n", " nn.GELU(),\n", ").to(device)\n", "\n", "# === LEVEL 2: Qwen-Math Supervisor ===\n", "\n", "proj_qwen = nn.Linear(QWEN_DIM, ROUTE_DIM).to(device)\n", "head_qwen = build_standard_head(head_config).to(device)\n", "\n", "# Level 2 receives: Qwen features + Level 1 fused features\n", "fusion_l2 = nn.Sequential(\n", " nn.Linear(ROUTE_DIM * 2, ROUTE_DIM),\n", " nn.LayerNorm(ROUTE_DIM),\n", " nn.GELU(),\n", " nn.Dropout(0.1),\n", ").to(device)\n", "\n", "# === OUTPUT ===\n", "NUM_BUCKETS = 10 # Reduced from 20 for easier learning\n", "\n", "classifier = nn.Sequential(\n", " nn.Linear(ROUTE_DIM, 256),\n", " nn.GELU(),\n", " nn.Dropout(0.1),\n", " nn.Linear(256, NUM_BUCKETS),\n", ").to(device)\n", "\n", "# Individual classifiers for emergence tracking\n", "classifier_qwen = nn.Linear(ROUTE_DIM, NUM_BUCKETS).to(device)\n", "classifier_t5 = nn.Linear(ROUTE_DIM, NUM_BUCKETS).to(device)\n", "classifier_symbolic = nn.Linear(ROUTE_DIM, NUM_BUCKETS).to(device)\n", "classifier_l1 = nn.Linear(ROUTE_DIM, NUM_BUCKETS).to(device) # Level 1 combined\n", "\n", "print(\"✓ Hierarchical architecture defined\")\n", "print(f\" Level 1: T5 + Symbolic → fusion_l1\")\n", "print(f\" Level 2: Qwen-Math + L1 → fusion_l2 → classifier\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "evj4Dmk8onVR", "outputId": "ca07d4fc-5493-440a-b455-19cfecfffe81" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "✓ Hierarchical architecture defined\n", " Level 1: T5 + Symbolic → fusion_l1\n", " Level 2: Qwen-Math + L1 → fusion_l2 → classifier\n" ] } ] }, { "cell_type": "code", "source": [ "# Cell: Collect trainable params\n", "trainable = (\n", " list(proj_t5.parameters()) + list(proj_qwen.parameters()) +\n", " list(symbolic_stream.parameters()) +\n", " list(head_t5.parameters()) + list(head_symbolic.parameters()) + list(head_qwen.parameters()) +\n", " list(fusion_l1.parameters()) + list(fusion_l2.parameters()) +\n", " list(classifier.parameters()) +\n", " list(classifier_qwen.parameters()) + list(classifier_t5.parameters()) +\n", " list(classifier_symbolic.parameters()) + list(classifier_l1.parameters())\n", ")\n", "\n", "optimizer = torch.optim.AdamW(trainable, lr=2e-4, weight_decay=0.01)\n", "criterion = nn.CrossEntropyLoss()\n", "\n", "print(f\"\\n=== HIERARCHICAL MATH COLLECTIVE ===\")\n", "print(f\"Frozen: Qwen2.5-Math-1.5B ({sum(p.numel() for p in qwen.parameters()):,}) + T5 ({sum(p.numel() for p in t5.parameters()):,})\")\n", "print(f\"Trainable: {sum(p.numel() for p in trainable):,}\")\n", "print(f\"Output: {NUM_BUCKETS} answer buckets\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "k9ET-pYfokWU", "outputId": "fe494679-c66e-45ad-a2f6-f639ab3cb9e9" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "=== HIERARCHICAL MATH COLLECTIVE ===\n", "Frozen: Qwen2.5-Math-1.5B (1,543,714,304) + T5 (109,628,544)\n", "Trainable: 13,686,822\n", "Output: 10 answer buckets\n" ] } ] }, { "cell_type": "code", "source": [ "# Cell: Forward pass\n", "# Cell: Fixed forward with safe last-token pooling\n", "def forward_hierarchical(questions):\n", " B = len(questions)\n", "\n", " # === ENCODE ===\n", " enc_qwen = qwen_tokenizer(questions, return_tensors=\"pt\",\n", " padding=True, truncation=True, max_length=256)\n", " with torch.no_grad():\n", " hidden_qwen = qwen(\n", " enc_qwen.input_ids.to(device),\n", " attention_mask=enc_qwen.attention_mask.to(device)\n", " ).last_hidden_state\n", "\n", " enc_t5 = t5_tokenizer(questions, return_tensors=\"pt\",\n", " padding=True, truncation=True, max_length=256)\n", " with torch.no_grad():\n", " hidden_t5 = t5(\n", " enc_t5.input_ids.to(device),\n", " attention_mask=enc_t5.attention_mask.to(device)\n", " ).last_hidden_state\n", "\n", " S = min(hidden_qwen.shape[1], hidden_t5.shape[1])\n", " hidden_qwen = hidden_qwen[:, :S, :]\n", " hidden_t5 = hidden_t5[:, :S, :]\n", "\n", " # === LEVEL 1 ===\n", " proj_t = proj_t5(hidden_t5)\n", " symbolic_feat = symbolic_stream(questions, S)\n", "\n", " routed_t5 = head_t5(proj_t)\n", " routed_symbolic = head_symbolic(symbolic_feat)\n", "\n", " pooled_t5 = routed_t5[:, 0]\n", " pooled_symbolic = routed_symbolic[:, 0]\n", "\n", " fused_l1 = fusion_l1(torch.cat([pooled_t5, pooled_symbolic], dim=-1))\n", "\n", " # === LEVEL 2 ===\n", " proj_q = proj_qwen(hidden_qwen)\n", " routed_qwen = head_qwen(proj_q)\n", "\n", " # FIX: Safe last-token pooling - clamp to actual sequence length after truncation\n", " seq_lens = enc_qwen.attention_mask[:, :S].sum(dim=1) - 1 # Truncated mask\n", " seq_lens = seq_lens.clamp(min=0, max=S-1).long() # Safety clamp\n", " pooled_qwen = routed_qwen[torch.arange(B, device=device), seq_lens]\n", "\n", " fused_l2 = fusion_l2(torch.cat([pooled_qwen, fused_l1], dim=-1))\n", "\n", " # === CLASSIFY ===\n", " logits = classifier(fused_l2)\n", "\n", " ind_logits = {\n", " 'qwen': classifier_qwen(pooled_qwen),\n", " 't5': classifier_t5(pooled_t5),\n", " 'symbolic': classifier_symbolic(pooled_symbolic),\n", " 'level1': classifier_l1(fused_l1),\n", " }\n", "\n", " return logits, ind_logits\n", "\n", "print(\"✓ Fixed with safe index clamping\")\n", "\n", "print(\"✓ Fixed Qwen pooling to use last token\")\n", "\n", "# Test\n", "test_q = [\"John has 5 apples and buys 3 more. How many apples does John have?\"]\n", "logits, ind = forward_hierarchical(test_q)\n", "print(f\"✓ Forward pass works: {logits.shape}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "w9Shin3FosSP", "outputId": "138c8176-c81a-4f07-bdd2-ee6c9e295ea4" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "✓ Fixed with safe index clamping\n", "✓ Fixed Qwen pooling to use last token\n", "✓ Forward pass works: torch.Size([1, 10])\n" ] } ] }, { "cell_type": "code", "source": [ "# Cell: Load data and setup buckets\n", "dataset = load_dataset(\"openai/gsm8k\", \"main\")\n", "\n", "def extract_final_answer(answer_text):\n", " match = re.search(r'####\\s*(-?\\d+\\.?\\d*)', answer_text)\n", " return float(match.group(1)) if match else None\n", "\n", "answers = [extract_final_answer(ex['answer']) for ex in dataset['train']]\n", "answers = [a for a in answers if a is not None]\n", "\n", "import numpy as np\n", "percentiles = np.percentile(answers, np.linspace(0, 100, NUM_BUCKETS + 1))\n", "print(f\"Answer range: {min(answers)} to {max(answers)}\")\n", "print(f\"{NUM_BUCKETS} buckets\")\n", "\n", "def answer_to_bucket(answer):\n", " for i, (low, high) in enumerate(zip(percentiles[:-1], percentiles[1:])):\n", " if answer <= high:\n", " return i\n", " return NUM_BUCKETS - 1\n", "\n", "def collate_fn(examples):\n", " return {\n", " 'question': [ex['question'] for ex in examples],\n", " 'answer': [ex['answer'] for ex in examples],\n", " }\n", "\n", "train_loader = DataLoader(dataset['train'], batch_size=8, shuffle=True, collate_fn=collate_fn) # Smaller batch for 1.5B model\n", "test_loader = DataLoader(dataset['test'], batch_size=8, shuffle=False, collate_fn=collate_fn)\n", "\n", "print(f\"Train batches: {len(train_loader)}, Test batches: {len(test_loader)}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JCH22yCxovZS", "outputId": "4e19eea8-a54d-4f02-ed0d-24155cb47e5f" }, "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Answer range: -47.0 to 192000000.0\n", "10 buckets\n", "Train batches: 935, Test batches: 165\n" ] } ] }, { "cell_type": "code", "source": [ "# Cell: Training loop\n", "EPOCHS = 5\n", "history = []\n", "\n", "for epoch in range(EPOCHS):\n", " head_t5.train(); head_symbolic.train(); head_qwen.train()\n", " fusion_l1.train(); fusion_l2.train(); classifier.train()\n", "\n", " correct, total = 0, 0\n", " pbar = tqdm(train_loader, desc=f\"Epoch {epoch+1}\")\n", "\n", " for batch in pbar:\n", " questions = batch['question']\n", " answers_text = batch['answer']\n", "\n", " labels = []\n", " for ans in answers_text:\n", " num = extract_final_answer(ans)\n", " labels.append(answer_to_bucket(num) if num else 0)\n", " labels = torch.tensor(labels).to(device)\n", "\n", " optimizer.zero_grad()\n", " logits, ind_logits = forward_hierarchical(questions)\n", "\n", " # Hierarchical loss\n", " loss = criterion(logits, labels) # Final output\n", " loss += 0.1 * criterion(ind_logits['qwen'], labels)\n", " loss += 0.1 * criterion(ind_logits['t5'], labels)\n", " loss += 0.05 * criterion(ind_logits['symbolic'], labels)\n", " loss += 0.1 * criterion(ind_logits['level1'], labels) # Level 1 combined\n", "\n", " loss.backward()\n", " optimizer.step()\n", "\n", " correct += (logits.argmax(-1) == labels).sum().item()\n", " total += labels.size(0)\n", " pbar.set_postfix({'acc': f'{correct/total:.1%}'})\n", "\n", " # Eval\n", " head_t5.eval(); head_symbolic.eval(); head_qwen.eval()\n", " fusion_l1.eval(); fusion_l2.eval(); classifier.eval()\n", "\n", " metrics = {k: 0 for k in ['collective', 'qwen', 't5', 'symbolic', 'level1']}\n", " val_total = 0\n", "\n", " with torch.no_grad():\n", " for batch in tqdm(test_loader, desc=\"Validating\"):\n", " questions = batch['question']\n", " answers_text = batch['answer']\n", "\n", " labels = []\n", " for ans in answers_text:\n", " num = extract_final_answer(ans)\n", " labels.append(answer_to_bucket(num) if num else 0)\n", " labels = torch.tensor(labels).to(device)\n", "\n", " logits, ind_logits = forward_hierarchical(questions)\n", "\n", " metrics['collective'] += (logits.argmax(-1) == labels).sum().item()\n", " for name, ind_log in ind_logits.items():\n", " metrics[name] += (ind_log.argmax(-1) == labels).sum().item()\n", " val_total += labels.size(0)\n", "\n", " accs = {k: v / val_total for k, v in metrics.items()}\n", " max_ind = max(accs['qwen'], accs['t5'], accs['symbolic'])\n", " rho = accs['collective'] / max_ind if max_ind > 0 else 0\n", "\n", " history.append({**accs, 'rho': rho, 'epoch': epoch + 1})\n", "\n", " print(f\"\\nEpoch {epoch+1}:\")\n", " print(f\" Collective: {accs['collective']:.1%} (hierarchical output)\")\n", " print(f\" Qwen-Math: {accs['qwen']:.1%}, T5: {accs['t5']:.1%}, Symbolic: {accs['symbolic']:.1%}\")\n", " print(f\" Level 1 (T5+Sym): {accs['level1']:.1%}\")\n", " print(f\" ρ = {rho:.3f}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 385, "referenced_widgets": [ "17f8d0b0ef4347fd81984c46fbb9e684", "7d78e730875146efa1dd35b906344678", "dc1906aa05194549a4d6dcc85d92dc09", "1f6c2b5fc10a4a98a5380a13fa70bc75", "dba61edae6884c92bdf9a3d76dec59c9", "99a31b328dbf401c9db2b340cb2105d5", "76efebd872c044b9a7f8c38d57eac49f", "5af975c1370f43d59b97ed336c866e0c", "4a035406ac194a67afca19b313e05b9a", "c429a347002a43fe8a0a57f5797767d7", "cce3e0efd2bb4b978c6fa150b6cac602", "da18291cbf374325ac676de844a353e7", "4e910a07266a40caabcc33eeb941bdc0", "9c602bb5011d48c8b04404f6b3a37fab", "a87e1520f5254be18114a5f782e4e234", "fd027301b7a6496eb8ae039c25bc4d4b", "9b3468290ae84339ac7c8ce2a9976741", "8beb040b66a1413abd534d5d455d71e2", "b0fc7aa19e194e91a3138867b2377db9", "e9a139d721a147d58952f4a1efc0401b", "b2c27f1f8a5f437e8f359f1e0ea05fbd", "1f697938d7e845bb9adcbb3015606f70", "7909e231d23b447c9a0975ed72e00f60", "654bf91d7d694141aa9d3382e72439cb", "ddbba23be5cd42d8a12669ae9ae7f0fc", "94201956b5774585b99637bf2157f1b0", "0d3995008ff149828c81d2b91f18be0b", "a3f9669d01d241e99b7a7bdd98792353", "e361562b232c49e6b4771a10106d0d4d", "b0b5f7a2436d4d51b0760b1162f219db", "4ceb35fd88a242218cb4992f631fe68c", "faca4620999e4480823607b9a673e570", "b2548dde84694f589b108a28ca34c4f2", "990d32fd8ad943898db8f524175171ba", "56543f9f03674385a106f46b3736a270", "0856e831cbb7419fbbe66e7e7fb99906", "f4f091fd560242a28cf09b6c33d7c9a3", "7421731362ea4c25abceefb9cf2c5d1b", "26e92b3e139a464a848b68a55a8095a8", "75cf349e98bd4a6bb45a66a112acfa30", "abbb218edf3a4f54bcd72acecfdd17b3", "72bc8f82e228452fb5faa36ff84d03a4", "bbee56a1661c4347bc538ad204e4efe2", "05669dee14ff4356a8185285ee9c2951", "a1dbd51ede52442e8b6b2cef26e8bbd9", "b62503ae035d447ab609dff304523785", "d98feea8dca847a1a087c19e0ffb40db", "903bdef3de794bd0a0978a9fa847dcc8", "81acd630739a4a99a33bc5c060b56c0b", "3f1848c901e14b6d86f9e10a52ca607e", "b47f59b606b94a68a3361a185e0040aa", "5f47532448504e4c8398688d85787201", "0e8901c365954716932d4a09bb5e615f", "7527f720a46a4ed384f09866b6550e6e", "d5df03c93765465fb8bc49dc19c88786" ] }, "id": "ZJdVWd7poxN4", "outputId": "9ff5bcae-647b-4a82-8d0d-17b116d29fa6" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Epoch 1: 0%| | 0/935 [00:00