|
1 | 1 | {
|
2 | 2 | "metadata": {
|
3 | 3 | "name": "",
|
4 |
| - "signature": "sha256:0c9d8c8b65b0eec5bb7c2a2790f08a1e49daf27dac2c9dcfe8d85ce958046a2c" |
| 4 | + "signature": "sha256:714a46a359c5b1c3e7e7bd4d19d73221f9def5bcb806840be82541070041d29e" |
5 | 5 | },
|
6 | 6 | "nbformat": 3,
|
7 | 7 | "nbformat_minor": 0,
|
|
57 | 57 | "- [Differences between 2 files](#Differences-between-2-files)\n",
|
58 | 58 | "- [Differences between successive elements in a list](#Differences-between-successive-elements-in-a-list)\n",
|
59 | 59 | "- [Doctest example](#Doctest-example)\n",
|
| 60 | + "- [English language detection](#English-language-detection)\n", |
60 | 61 | "- [File browsing basics](#File-browsing-basics)\n",
|
61 | 62 | "- [File reading basics](#File-reading-basics)\n",
|
62 | 63 | "- [Indices of min and max elements from a list](#Indices-of-min-and-max-elements-from-a-list)\n",
|
|
595 | 596 | "<br>"
|
596 | 597 | ]
|
597 | 598 | },
|
| 599 | + { |
| 600 | + "cell_type": "heading", |
| 601 | + "level": 2, |
| 602 | + "metadata": {}, |
| 603 | + "source": [ |
| 604 | + "English language detection" |
| 605 | + ] |
| 606 | + }, |
| 607 | + { |
| 608 | + "cell_type": "markdown", |
| 609 | + "metadata": {}, |
| 610 | + "source": [ |
| 611 | + "[back to top](#Table-of-Contents)" |
| 612 | + ] |
| 613 | + }, |
| 614 | + { |
| 615 | + "cell_type": "code", |
| 616 | + "collapsed": false, |
| 617 | + "input": [ |
| 618 | + "import nltk\n", |
| 619 | + "\n", |
| 620 | + "def eng_ratio(text):\n", |
| 621 | + "    ''' Returns the fraction of unique alphabetic words in text that are not in the NLTK English word list (0.0 if text has no alphabetic words) '''\n", |
| 622 | + "\n", |
| 623 | + "    english_vocab = set(w.lower() for w in nltk.corpus.words.words())\n", |
| 624 | + "    text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha())\n", |
| 625 | + "    unusual = text_vocab.difference(english_vocab)\n", |
| 626 | + "    # Guard: empty text or text without purely alphabetic tokens would divide by zero\n", |
| 627 | + "    return len(unusual)/len(text_vocab) if text_vocab else 0.0\n", |
| 628 | + " \n", |
| 629 | + "text = 'This is a test fahrrad'\n", |
| 630 | + "\n", |
| 631 | + "print(eng_ratio(text))" |
| 632 | + ], |
| 633 | + "language": "python", |
| 634 | + "metadata": {}, |
| 635 | + "outputs": [ |
| 636 | + { |
| 637 | + "output_type": "stream", |
| 638 | + "stream": "stdout", |
| 639 | + "text": [ |
| 640 | + "0.2\n" |
| 641 | + ] |
| 642 | + } |
| 643 | + ], |
| 644 | + "prompt_number": 1 |
| 645 | + }, |
| 646 | + { |
| 647 | + "cell_type": "markdown", |
| 648 | + "metadata": {}, |
| 649 | + "source": [ |
| 650 | + "<br>\n", |
| 651 | + "<br>" |
| 652 | + ] |
| 653 | + }, |
598 | 654 | {
|
599 | 655 | "cell_type": "heading",
|
600 | 656 | "level": 2,
|
|
0 commit comments