From 1772bb67cea80e4d9abc8cce5a5db9ed9d0559f6 Mon Sep 17 00:00:00 2001 From: Luis Garcia Date: Fri, 12 Jul 2024 12:07:48 -0500 Subject: [PATCH] docusaurus plugin client redirects installed, blog posts deleted and redirected to the new ones in the marketing site --- docs/blog/2024-05-13-soc2-announcement.md | 47 ------------- docs/blog/2024-05-15-active-learning.md | 86 ----------------------- docs/docusaurus.config.js | 17 +++++ docs/package-lock.json | 24 +++++++ docs/package.json | 1 + 5 files changed, 42 insertions(+), 133 deletions(-) delete mode 100644 docs/blog/2024-05-13-soc2-announcement.md delete mode 100644 docs/blog/2024-05-15-active-learning.md diff --git a/docs/blog/2024-05-13-soc2-announcement.md b/docs/blog/2024-05-13-soc2-announcement.md deleted file mode 100644 index 295a60ae..00000000 --- a/docs/blog/2024-05-13-soc2-announcement.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: Groundlight AI Achieves SOC 2 Type 2 Compliance -description: Groundlight AI Achieves SOC 2 Type 2 Compliance -slug: groundlight-ai-achieves-soc-2-type-2-compliance -authors: - - name: Phillipie Motley - title: Operations Manager - image_url: https://a-us.storyblok.com/f/1015187/1000x1000/1902e83e56/motleyp.jpg -tags: [soc-2] -hide_table_of_contents: false ---- - -# Groundlight AI Is Now SOC 2 Type 2 Compliant - -At Groundlight, we take data security and privacy extremely seriously. From the very beginning, we recognized the importance of implementing stringent controls and processes to safeguard our clients' sensitive information. That's why we made the decision early on to pursue SOC 2 compliance. - - - -## What is SOC 2 Verification - -For those unfamiliar, SOC 2 (Service Organization Control 2) is an auditing framework established by the American Institute of Certified Public Accountants (AICPA). It involves an in-depth external review of an organization's security policies, procedures, and controls by an independent auditor. Achieving SOC 2 certification demonstrates our unwavering commitment to maintaining the highest standards of data protection and privacy. - -### What Are the Different SOC 2 Types? - -**SOC 2 Type 1:** Evaluates an organization's cybersecurity controls at a single point in time. - -**SOC 2 Type 2:** Type 2 report assesses the operational effectiveness of controls over a defined period of time (3, 6, 12 months). - -## How Did Groundlight Achieve SOC 2 Compliance - -Achieving SOC 2 compliance is a marathon, not a sprint. It demands meticulous planning and dedication from teams across the entire organization. At Groundlight, we took a methodical approach by first establishing an audit timeline. From there, we worked backwards systematically to get our house in order. -Teams across engineering, security, operations, and more collaborated to implement rigorous security policies and controls. We overhauled processes for everything from access management to incident response handling. Robust evidence collection and documentation mechanisms were put into place. -Once we had thoroughly prepared, we brought in external auditors to conduct their independent evaluation. This was the high-stakes final exam. Our policies, technical safeguards, and control operations were stress-tested and scrutinized over an extended period. - -## What Does SOC 2 Verification Mean for Groundlight AI’s Data Security - -From day one, Groundlight has made data security and privacy a top priority. Safeguarding our customers' sensitive information is foundational to our business. 
So while achieving SOC 2 certification marks an important milestone, it simply reinforces practices that have been ingrained in our DNA all along. -We've never treated security as an afterthought or box to check. Instead, we've embraced building robust data protections into the core of our products and services from the ground up. Our policies, processes and technical controls are meticulously tailored to our unique operations - not generic one-size-fits-all measures. -SOC 2 compliance validates that we've institutionalized this security-first mindset across the entire organization. But it's just one step along our continuous journey. As data privacy regulations evolve and new threats emerge, we'll remain vigilant in regularly reassessing and elevating our safeguards. -By upholding the highest standards like SOC 2, we solidify the unshakable foundation of trust with our customers. Upholding these compliance standards unlocks new business opportunities and allows us to double down on our commitment to being steadfast in data security. - -## Key Takeaways of SOC 2 Verification - -- Prioritizing security and tailoring controls to our needs, not just checking boxes -- SOC 2 enables new business growth by meeting vendor security requirements -- Earning certification required full organizational commitment and stakeholder participation -- This marks an important milestone, but our security journey is never complete diff --git a/docs/blog/2024-05-15-active-learning.md b/docs/blog/2024-05-15-active-learning.md deleted file mode 100644 index 745bd07d..00000000 --- a/docs/blog/2024-05-15-active-learning.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -title: "Reducing Data Labeling Costs with Uncertainty Sampling" -description: How Groundlight uses active learning to train accurate vision models while saving on data labeling costs. -slug: active-learning -authors: - - name: Ted Sandler - title: Senior Applied Scientist - image_url: https://a-us.storyblok.com/f/1015187/1000x1000/efc35da152/sandlert.jpg -tags: [active learning, uncertainty sampling, deep dive] -image: ./images/active-learning/dog-conf-high.png -hide_table_of_contents: false ---- - -At Groundlight, we train each detector's machine learning (ML) model on images that have been manually labeled with correct responses. However, collecting labels at scale becomes expensive because it requires human review. Given that detectors are frequently applied to streams of images that change slowly over time, reviewing all images as they arrive is likely to result in effort wasted on labeling similar images that add little information to the training set. - - - -## What is Active Learning in Machine Learning? - -To avoid unnecessary labeling and save customers money, Groundlight uses **[active learning](https://en.wikipedia.org/wiki/Active_learning_(machine_learning))**, a machine learning protocol in which the ML model plays an active role in determining which images get manually labeled for training. With active learning, only informative images are prioritized for review, making it possible to label small a subset of the available data but train a model that's roughly as good as one trained with all the data labeled [\[Settles, 2009\]](https://minds.wisconsin.edu/handle/1793/60660). - -## What is Uncertainty Sampling? 
- -The variant of active learning we use at Groundlight is based on **[uncertainty sampling](https://lilianweng.github.io/posts/2022-02-20-active-learning/#uncertainty-sampling)**, a well studied and effective method that can be used in either the streaming setting or the pool-based setting in which there exists a large reservoir of unlabeled examples to draw from. We operate in the stream-based setting, where images arrive one at a time and it must be decided in the moment whether to escalate an image for review. - -## How Does Uncertainty Sampling Work? - -Imagine we have a detector that processes a stream of images arriving one by one. The detector's ML model is trained on all images labeled up to that point in time. When a new image arrives, the model makes its best guess prediction for the new image and also reports its confidence in that prediction. The confidence is expressed as a probability (a number between zero and one) that the prediction is correct. - -In uncertainty sampling, we escalate those images whose predictions have low confidence so they can be manually reviewed and labeled. Conversely, we largely leave images with confident predictions unescalated and therefore unlabeled. In this way, we avoid the expense and effort of labeling images whose predictions are likely correct. But we still continue to label images the model is unsure of so it can improve on them. - -## An Example of Uncertainty Sampling - -As an example, the images shown below were sent to a detector that identifies the presence of dogs in and around a swimming pool at [Dogmode's Aquatic Center](https://dogmode.com/aquatic-fitness-center-pool-view/). The model reports with 95% confidence that there is a dog in the image on the left. But it is less confident in its response for the image on the right, saying there is no dog present with only 75% confidence. (There is in fact a dog at the back left corner of the pool, but it’s difficult to see.) - -
[Figure: two example images from the pool camera, shown side by side. The model answers "Yes" for the left image (dog present, 95% confidence) and "No" for the right image (75% confidence).]

Assuming the detector's confidence threshold is set to a value between 75% and 95%, uncertainty sampling will escalate the image on the right for cloud labeling but not the one on the left. A user can set their detector's confidence threshold by adjusting the confidence threshold slider on the detector detail page. The image below shows this slider.

[Screenshot: the confidence threshold slider on the detector detail page.]
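To make the escalation rule concrete, here is a minimal, self-contained Python sketch of the stream-based uncertainty sampling loop described above. It is illustrative only: `ToyDetector`, `ask_human_for_label`, and the 95% threshold are hypothetical stand-ins, not Groundlight's actual models or API.

```python
"""A minimal sketch of stream-based uncertainty sampling, using a toy model.

Illustrative only: ToyDetector, ask_human_for_label, and the threshold value
are hypothetical stand-ins, not Groundlight's actual implementation or API.
"""

import random


class ToyDetector:
    """Stand-in for a detector's ML model: returns an answer and a confidence."""

    def __init__(self):
        self.training_set = []

    def predict(self, image):
        # A real model would look at the image; here we fake an answer and a
        # confidence score between 0.5 and 1.0.
        return random.choice(["YES", "NO"]), random.uniform(0.5, 1.0)

    def retrain(self):
        # A real model would be updated on self.training_set after new labels arrive.
        pass


def ask_human_for_label(image):
    """Stand-in for escalating an image to a human labeler."""
    return "YES"


def run_stream(detector, image_stream, confidence_threshold=0.95):
    """Process images one at a time, escalating only low-confidence predictions."""
    num_escalated = 0
    for image in image_stream:
        answer, confidence = detector.predict(image)
        if confidence < confidence_threshold:
            # Uncertain prediction: escalate for human review and add the label
            # to the training set so the model can improve.
            label = ask_human_for_label(image)
            detector.training_set.append((image, label))
            detector.retrain()
            num_escalated += 1
        # Confident predictions are used as-is and are not labeled.
    return num_escalated


if __name__ == "__main__":
    escalated = run_stream(ToyDetector(), range(500))
    print(f"Escalated {escalated} of 500 images for manual labeling")
```

Lowering `confidence_threshold` escalates fewer images, trading labeling cost against model accuracy; that trade-off is exactly what the experiment below measures.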
## Experiment with Uncertainty Sampling

We now present results from a time-series experiment on images collected and labeled for the purpose of measuring uncertainty sampling's impact on model accuracy and labeling cost. There are 500 images of a gate, and the task is to determine in every image if the gate has been left open or closed. All images in the experiment are labeled so we know the correct responses. But note[^1] that this would not be the case if we were running active learning in real life because, by design, active learning does not recruit labels on high-confidence images.

[^1]: In practice, we audit a constant fraction of confidently predicted images; these audits serve as an additional source of labeled data.

Our results compare the performance of three models trained under different protocols:
1. No uncertainty sampling, all images are escalated for manual review and labeling
2. Moderate uncertainty sampling, predictions less than 95% confident get escalated
3. Aggressive uncertainty sampling, only predictions below 75% confidence are escalated

The training sets of all three models are initialized with the same 20 images, 10 labeled from each class.

The plot below shows that the model trained with moderate uncertainty sampling (confidence threshold 95%) has an error rate similar to the model trained without any uncertainty sampling. This demonstrates that uncertainty sampling can fit a model as accurately as labeling and training on all the available data.

[Plot: error rates of the three models.]

On the other hand, aggressive uncertainty sampling (confidence threshold 75%) escalates too few images for labeling, resulting in a model trained on less data, which makes more mistakes. This shows how the confidence threshold controls the trade-off between model accuracy and manual labeling costs. Indirectly, it also demonstrates the need for calibrating models so their reported confidences reflect observed frequencies and can be used for making decisions about when to escalate. We calibrate machine learning models at Groundlight, though the details are beyond the scope of this post.

Strikingly, plotting the number of images escalated by each model shows that active learning dramatically reduces labeling costs. The model trained without uncertainty sampling escalates all 500 images for manual review. In contrast, the model trained with moderate uncertainty sampling escalates only 132 images in total. This is a nearly 75% reduction in manual labeling and cost with little change in model error. Aggressive uncertainty sampling escalates even fewer images, only 60, but the resulting model has noticeably higher error as observed in the plot above.

[Plot: number of images escalated for labeling by each of the three models.]

## Conclusion

At Groundlight, we use active learning to reduce labeling costs for our customers. In particular, we use a variant based on uncertainty sampling that is extremely effective and easy to explain. A small experiment on an image time-series dataset shows that uncertainty sampling can dramatically reduce the number of images labeled without impacting model accuracy. If you want to learn more about active learning and its various formulations, definitely check out the references below.

## References

1. [Settles, Burr. *Active learning literature survey*. University of Wisconsin-Madison Department of Computer Sciences, 2009.](https://minds.wisconsin.edu/handle/1793/60660)
2. [Weng, Lilian. "Learning with not Enough Data Part 2: Active Learning." Lil'Log, February 20 2022. Accessed
April 29 2024.](https://lilianweng.github.io/posts/2022-02-20-active-learning/) diff --git a/docs/docusaurus.config.js b/docs/docusaurus.config.js index c4c8fa71..58cd2e0e 100644 --- a/docs/docusaurus.config.js +++ b/docs/docusaurus.config.js @@ -209,6 +209,23 @@ const config = { darkTheme: darkCodeTheme, }, }), + plugins: [ + [ + "@docusaurus/plugin-client-redirects", + { + redirects: [ + { + to: "https://www.groundlight.ai/blog/reducing-data-labeling-costs-with-uncertainty-sampling", // new marketing site route + from: "/blog/active-learning", // old blog route + }, + { + to: "https://www.groundlight.ai/blog/groundlight-ai-achieves-soc-2-type-2-compliance", // new marketing site route + from: "/blog/groundlight-ai-achieves-soc-2-type-2-compliance", // old blog route + }, + ], + }, + ], + ], }; module.exports = config; \ No newline at end of file diff --git a/docs/package-lock.json b/docs/package-lock.json index 9cba63f6..4d5d6edf 100644 --- a/docs/package-lock.json +++ b/docs/package-lock.json @@ -9,6 +9,7 @@ "version": "0.0.0", "dependencies": { "@docusaurus/core": "3.0.0", + "@docusaurus/plugin-client-redirects": "^3.0.0", "@docusaurus/preset-classic": "3.0.0", "@easyops-cn/docusaurus-search-local": "^0.38.0", "@mdx-js/react": "^3.0.0", @@ -2356,6 +2357,29 @@ "react-dom": "*" } }, + "node_modules/@docusaurus/plugin-client-redirects": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/@docusaurus/plugin-client-redirects/-/plugin-client-redirects-3.0.0.tgz", + "integrity": "sha512-JcZLod4lgPdbv/OpCbNwTc57u54d01dcWiDy/sBaxls/4HkDGdj6838oBPzbBdnCWrmasBIRz3JYLk+1GU0IOQ==", + "dependencies": { + "@docusaurus/core": "3.0.0", + "@docusaurus/logger": "3.0.0", + "@docusaurus/utils": "3.0.0", + "@docusaurus/utils-common": "3.0.0", + "@docusaurus/utils-validation": "3.0.0", + "eta": "^2.2.0", + "fs-extra": "^11.1.1", + "lodash": "^4.17.21", + "tslib": "^2.6.0" + }, + "engines": { + "node": ">=18.0" + }, + "peerDependencies": { + "react": "^18.0.0", + "react-dom": "^18.0.0" + } + }, "node_modules/@docusaurus/plugin-content-blog": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/@docusaurus/plugin-content-blog/-/plugin-content-blog-3.0.0.tgz", diff --git a/docs/package.json b/docs/package.json index 83f7c3cb..7ba32434 100644 --- a/docs/package.json +++ b/docs/package.json @@ -16,6 +16,7 @@ }, "dependencies": { "@docusaurus/core": "3.0.0", + "@docusaurus/plugin-client-redirects": "^3.0.0", "@docusaurus/preset-classic": "3.0.0", "@easyops-cn/docusaurus-search-local": "^0.38.0", "@mdx-js/react": "^3.0.0",