From 5fb9defe08e429bf3ac996f03a9d4f41aa7e7324 Mon Sep 17 00:00:00 2001 From: Jerjou Cheng Date: Wed, 6 Apr 2016 11:11:12 -0700 Subject: [PATCH 01/10] Add samples for natural language api. Change-Id: I089b244f1a0a2210fffab6f5747eed9f6150147a --- .coveragerc | 1 + conftest.py | 24 +- language/README.md | 17 + language/api/README.md | 87 +++++ language/api/analyze.py | 115 ++++++ language/api/analyze_test.py | 258 +++++++++++++ language/api/requirements.txt | 1 + language/movie_nl/README.md | 154 ++++++++ language/movie_nl/main.py | 346 +++++++++++++++++ language/movie_nl/main_test.py | 82 ++++ language/movie_nl/requirements.txt | 3 + language/ocr_nl/README.md | 227 +++++++++++ language/ocr_nl/main.py | 362 ++++++++++++++++++ language/ocr_nl/main_test.py | 97 +++++ language/ocr_nl/requirements.txt | 1 + language/syntax_triples/README.md | 91 +++++ language/syntax_triples/main.py | 180 +++++++++ language/syntax_triples/main_test.py | 50 +++ language/syntax_triples/requirements.txt | 1 + .../resources/obama_wikipedia.txt | 1 + 20 files changed, 2097 insertions(+), 1 deletion(-) create mode 100644 language/README.md create mode 100644 language/api/README.md create mode 100644 language/api/analyze.py create mode 100644 language/api/analyze_test.py create mode 100644 language/api/requirements.txt create mode 100644 language/movie_nl/README.md create mode 100644 language/movie_nl/main.py create mode 100644 language/movie_nl/main_test.py create mode 100644 language/movie_nl/requirements.txt create mode 100644 language/ocr_nl/README.md create mode 100755 language/ocr_nl/main.py create mode 100755 language/ocr_nl/main_test.py create mode 100644 language/ocr_nl/requirements.txt create mode 100644 language/syntax_triples/README.md create mode 100644 language/syntax_triples/main.py create mode 100755 language/syntax_triples/main_test.py create mode 100644 language/syntax_triples/requirements.txt create mode 100644 language/syntax_triples/resources/obama_wikipedia.txt diff --git a/.coveragerc b/.coveragerc index a0523ed4a49..cb53bfdaf96 100644 --- a/.coveragerc +++ b/.coveragerc @@ -9,6 +9,7 @@ include = dns/* datastore/* error_reporting/* + language/* managed_vms/* monitoring/* speech/* diff --git a/conftest.py b/conftest.py index 14876c24eac..3fa68de6953 100644 --- a/conftest.py +++ b/conftest.py @@ -15,9 +15,10 @@ import os import pytest +import requests -class Namespace: +class Namespace(object): def __init__(self, **kwargs): self.__dict__.update(kwargs) @@ -48,3 +49,24 @@ def resource(request): testing resource""" local_path = os.path.dirname(request.module.__file__) return lambda *args: get_resource_path(args, local_path) + + +def fetch_gcs_resource(resource, tmpdir, _chunk_size=1024): + resp = requests.get(resource, stream=True) + dest_file = str(tmpdir.join(os.path.basename(resource))) + with open(dest_file, 'wb') as f: + for chunk in resp.iter_content(_chunk_size): + f.write(chunk) + + return dest_file + + +@pytest.fixture(scope='module') +def remote_resource(cloud_config): + """Provides a function that downloads the given resource from Cloud + Storage, returning the path to the downloaded resource.""" + remote_uri = 'http://storage.googleapis.com/{}/'.format( + cloud_config.storage_bucket) + + return lambda path, tmpdir: fetch_gcs_resource( + remote_uri + path.strip('/'), tmpdir) diff --git a/language/README.md b/language/README.md new file mode 100644 index 00000000000..e63d45eb9a6 --- /dev/null +++ b/language/README.md @@ -0,0 +1,17 @@ +# Google Cloud Natural Language API examples + +This 
directory contains Python examples that use the +[Google Cloud Natural Language API](https://cloud.google.com/natural-language/). + +- [api](api) has a simple command line tool that shows off the API's features. + +- [movie_nl](movie_nl) combines sentiment and entity analysis to come up with +actors/directors who are the most and least popular in the imdb movie reviews. + +- [ocr_nl](ocr_nl) uses the [Cloud Vision API](https://cloud.google.com/vision/) +to extract text from images, then uses the NL API to extract entity information +from those texts, and stores the extracted information in a database in support +of further analysis and correlation. + +- [syntax_triples](syntax_triples) uses syntax analysis to find +subject-verb-object triples in a given piece of text. diff --git a/language/api/README.md b/language/api/README.md new file mode 100644 index 00000000000..9625df30c89 --- /dev/null +++ b/language/api/README.md @@ -0,0 +1,87 @@ + +# Google Cloud Natural Language API Sample + +This Python sample demonstrates the use of the [Google Cloud Natural Language API][NL-Docs] +for sentiment, entity, and syntax analysis. + +[NL-Docs]: https://cloud.google.com/natural-language/docs/ + +## Setup + +Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) +steps in the Quickstart doc to create a project and enable the +Cloud Natural Language API. Following those steps, make sure that you +[Set Up a Service Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), +and export the following environment variable: + +``` +export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json +``` + +## Run the sample + +Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. + +To run the example, install the necessary libraries using pip: + +```sh +$ pip install -r requirements.txt +``` + +Then, run the script: + +```sh +$ python analyze.py +``` + +where `` is one of: `entities`, `sentiment`, or `syntax`. + +The script will write to STDOUT the json returned from the API for the requested feature. + +For example, if you run: + +```sh +$ python analyze.py entities "Tom Sawyer is a book written by a guy known as Mark Twain." +``` + +You will see something like the following returned: + +``` +{ + "entities": [ + { + "salience": 0.49785897, + "mentions": [ + { + "text": { + "content": "Tom Sawyer", + "beginOffset": 0 + } + } + ], + "type": "PERSON", + "name": "Tom Sawyer", + "metadata": { + "wikipedia_url": "http://en.wikipedia.org/wiki/The_Adventures_of_Tom_Sawyer" + } + }, + { + "salience": 0.12209519, + "mentions": [ + { + "text": { + "content": "Mark Twain", + "beginOffset": 47 + } + } + ], + "type": "PERSON", + "name": "Mark Twain", + "metadata": { + "wikipedia_url": "http://en.wikipedia.org/wiki/Mark_Twain" + } + } + ], + "language": "en" +} +``` diff --git a/language/api/analyze.py b/language/api/analyze.py new file mode 100644 index 00000000000..73e892c354a --- /dev/null +++ b/language/api/analyze.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python + +# Copyright 2016 Google, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Analyzes text using the Google Cloud Natural Language API.""" + +import argparse +import json +import sys + +from googleapiclient import discovery +import httplib2 +from oauth2client.client import GoogleCredentials + + +def get_service(): + credentials = GoogleCredentials.get_application_default() + scoped_credentials = credentials.create_scoped( + ['https://www.googleapis.com/auth/cloud-platform']) + http = httplib2.Http() + scoped_credentials.authorize(http) + return discovery.build('language', 'v1beta1', http=http) + + +def get_native_encoding_type(): + """Returns the encoding type that matches Python's native strings.""" + if sys.maxunicode == 65535: + return 'UTF16' + else: + return 'UTF32' + + +def analyze_entities(text, encoding='UTF32'): + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'encodingType': encoding, + } + + service = get_service() + + request = service.documents().analyzeEntities(body=body) + response = request.execute() + + return response + + +def analyze_sentiment(text): + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + } + } + + service = get_service() + + request = service.documents().analyzeSentiment(body=body) + response = request.execute() + + return response + + +def analyze_syntax(text, encoding='UTF32'): + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'features': { + 'extract_syntax': True, + }, + 'encodingType': encoding, + } + + service = get_service() + + request = service.documents().annotateText(body=body) + response = request.execute() + + return response + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('command', choices=[ + 'entities', 'sentiment', 'syntax']) + parser.add_argument('text') + + args = parser.parse_args() + + if args.command == 'entities': + result = analyze_entities(args.text, get_native_encoding_type()) + elif args.command == 'sentiment': + result = analyze_sentiment(args.text) + elif args.command == 'syntax': + result = analyze_syntax(args.text, get_native_encoding_type()) + + print(json.dumps(result, indent=2)) diff --git a/language/api/analyze_test.py b/language/api/analyze_test.py new file mode 100644 index 00000000000..11b0d65d629 --- /dev/null +++ b/language/api/analyze_test.py @@ -0,0 +1,258 @@ +# Copyright 2016, Google, Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
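
"""Tests for analyze.py.

These tests call the live Natural Language API, so the
GOOGLE_APPLICATION_CREDENTIALS environment variable must point to valid
service account credentials. The UTF8, UTF16 and UTF32 tests demonstrate
how to interpret the beginOffset values returned for each encodingType.
"""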
+ +import textwrap + +import analyze + + +def test_analyze_entities(): + result = analyze.analyze_entities( + 'Tom Sawyer is a book written by a guy known as Mark Twain.') + + assert result['language'] == 'en' + entities = result['entities'] + assert len(entities) + subject = entities[0] + assert subject['type'] == 'PERSON' + assert subject['name'].startswith('Tom') + + +def test_analyze_sentiment(capsys): + result = analyze.analyze_sentiment( + 'your face is really ugly and i hate it.') + + sentiment = result['documentSentiment'] + assert sentiment['polarity'] < 0 + assert sentiment['magnitude'] < 1 + + result = analyze.analyze_sentiment( + 'cheerio, mate - I greatly admire the pallor of your visage, and your ' + 'angle of repose leaves little room for improvement.') + + sentiment = result['documentSentiment'] + assert sentiment['polarity'] > 0 + assert sentiment['magnitude'] < 1 + + +def test_analyze_syntax(capsys): + result = analyze.analyze_syntax(textwrap.dedent(u'''\ + Keep away from people who try to belittle your ambitions. Small people + always do that, but the really great make you feel that you, too, can + become great. + - Mark Twain''')) + + assert len(result['tokens']) + first_token = result['tokens'][0] + assert first_token['text']['content'] == 'Keep' + assert first_token['partOfSpeech']['tag'] == 'VERB' + assert len(result['sentences']) > 1 + assert result['language'] == 'en' + + +def test_analyze_syntax_utf8(): + """Demonstrate the interpretation of the offsets when encoding=utf8. + + UTF8 is a variable-length encoding, where each character is at least 8 + bits. The offsets we get should be the index of the first byte of the + character. + """ + test_string = u'a \u00e3 \u0201 \U0001f636 b' + byte_array = test_string.encode('utf8') + result = analyze.analyze_syntax(test_string, encoding='UTF8') + tokens = result['tokens'] + + assert tokens[0]['text']['content'] == 'a' + offset = tokens[0]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset+1].decode('utf8') == + tokens[0]['text']['content']) + + assert tokens[1]['text']['content'] == u'\u00e3' + offset = tokens[1]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset+2].decode('utf8') == + tokens[1]['text']['content']) + + assert tokens[2]['text']['content'] == u'\u0201' + offset = tokens[2]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset+2].decode('utf8') == + tokens[2]['text']['content']) + + assert tokens[3]['text']['content'] == u'\U0001f636' + offset = tokens[3]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset+4].decode('utf8') == + tokens[3]['text']['content']) + + # This demonstrates that the offset takes into account the variable-length + # characters before the target token. + assert tokens[4]['text']['content'] == u'b' + offset = tokens[4]['text'].get('beginOffset', 0) + # 'b' is only one byte long + assert (byte_array[offset:offset+1].decode('utf8') == + tokens[4]['text']['content']) + + +def test_analyze_syntax_utf16(): + """Demonstrate the interpretation of the offsets when encoding=utf16. + + UTF16 is a variable-length encoding, where each character is at least 16 + bits. The returned offsets will be the index of the first 2-byte character + of the token. 
+ """ + test_string = u'a \u00e3 \u0201 \U0001f636 b' + byte_array = test_string.encode('utf16') + # Remove the byte order marker, which the offsets don't account for + byte_array = byte_array[2:] + result = analyze.analyze_syntax(test_string, encoding='UTF16') + tokens = result['tokens'] + + assert tokens[0]['text']['content'] == 'a' + # The offset is an offset into an array where each entry is 16 bits. Since + # we have an 8-bit array, the offsets should be doubled to index into our + # array. + offset = 2 * tokens[0]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset + 2].decode('utf16') == + tokens[0]['text']['content']) + + assert tokens[1]['text']['content'] == u'\u00e3' + offset = 2 * tokens[1]['text'].get('beginOffset', 0) + # A UTF16 character with a low codepoint is 16 bits (2 bytes) long, so + # slice out 2 bytes starting from the offset. Then interpret the bytes as + # utf16 for comparison. + assert (byte_array[offset:offset + 2].decode('utf16') == + tokens[1]['text']['content']) + + assert tokens[2]['text']['content'] == u'\u0201' + offset = 2 * tokens[2]['text'].get('beginOffset', 0) + # A UTF16 character with a low codepoint is 16 bits (2 bytes) long, so + # slice out 2 bytes starting from the offset. Then interpret the bytes as + # utf16 for comparison. + assert (byte_array[offset:offset + 2].decode('utf16') == + tokens[2]['text']['content']) + + assert tokens[3]['text']['content'] == u'\U0001f636' + offset = 2 * tokens[3]['text'].get('beginOffset', 0) + # A UTF16 character with a high codepoint is 32 bits (4 bytes) long, so + # slice out 4 bytes starting from the offset. Then interpret those bytes as + # utf16 for comparison. + assert (byte_array[offset:offset + 4].decode('utf16') == + tokens[3]['text']['content']) + + # This demonstrates that the offset takes into account the variable-length + # characters before the target token. + assert tokens[4]['text']['content'] == u'b' + offset = 2 * tokens[4]['text'].get('beginOffset', 0) + # Even though 'b' is only one byte long, utf16 still encodes it using 16 + # bits + assert (byte_array[offset:offset + 2].decode('utf16') == + tokens[4]['text']['content']) + + +def test_annotate_text_utf32(): + """Demonstrate the interpretation of the offsets when encoding=utf32. + + UTF32 is a fixed-length encoding, where each character is exactly 32 bits. + The returned offsets will be the index of the first 4-byte character + of the token. + + Python unicode objects index by the interpreted unicode character. This + means a given unicode character only ever takes up one slot in a unicode + string. This is equivalent to indexing into a UTF32 string, where all + characters are a fixed length and thus will only ever take up one slot. + + Thus, if you're indexing into a python unicode object, you can set + encoding to UTF32 to index directly into the unicode object (as opposed to + the byte arrays, as these examples do). + + Nonetheless, this test still demonstrates indexing into the byte array, for + consistency. Note that you could just index into the origin test_string + unicode object with the raw offset returned by the api (ie without + multiplying it by 4, as it is below). 
+ """ + test_string = u'a \u00e3 \u0201 \U0001f636 b' + byte_array = test_string.encode('utf32') + # Remove the byte order marker, which the offsets don't account for + byte_array = byte_array[4:] + result = analyze.analyze_syntax(test_string, encoding='UTF32') + tokens = result['tokens'] + + assert tokens[0]['text']['content'] == 'a' + # The offset is an offset into an array where each entry is 32 bits. Since + # we have an 8-bit array, the offsets should be quadrupled to index into + # our array. + offset = 4 * tokens[0]['text'].get('beginOffset', 0) + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[0]['text']['content']) + + assert tokens[1]['text']['content'] == u'\u00e3' + offset = 4 * tokens[1]['text'].get('beginOffset', 0) + # A UTF32 character with a low codepoint is 32 bits (4 bytes) long, so + # slice out 4 bytes starting from the offset. Then interpret the bytes as + # utf32 for comparison. + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[1]['text']['content']) + + assert tokens[2]['text']['content'] == u'\u0201' + offset = 4 * tokens[2]['text'].get('beginOffset', 0) + # A UTF32 character with a low codepoint is 32 bits (4 bytes) long, so + # slice out 4 bytes starting from the offset. Then interpret the bytes as + # utf32 for comparison. + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[2]['text']['content']) + + assert tokens[3]['text']['content'] == u'\U0001f636' + offset = 4 * tokens[3]['text'].get('beginOffset', 0) + # A UTF32 character with a high codepoint is 32 bits (4 bytes) long, so + # slice out 4 bytes starting from the offset. Then interpret those bytes as + # utf32 for comparison. + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[3]['text']['content']) + + # This demonstrates that the offset takes into account the variable-length + # characters before the target token. + assert tokens[4]['text']['content'] == u'b' + offset = 4 * tokens[4]['text'].get('beginOffset', 0) + # Even though 'b' is only one byte long, utf32 still encodes it using 32 + # bits + assert (byte_array[offset:offset + 4].decode('utf32') == + tokens[4]['text']['content']) + + +def test_annotate_text_utf32_directly_index_into_unicode(): + """Demonstrate using offsets directly, using encoding=utf32. + + See the explanation for test_annotate_text_utf32. Essentially, indexing + into a utf32 array is equivalent to indexing into a python unicode object. 
+ """ + test_string = u'a \u00e3 \u0201 \U0001f636 b' + result = analyze.analyze_syntax(test_string, encoding='UTF32') + tokens = result['tokens'] + + assert tokens[0]['text']['content'] == 'a' + offset = tokens[0]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[0]['text']['content'] + + assert tokens[1]['text']['content'] == u'\u00e3' + offset = tokens[1]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[1]['text']['content'] + + assert tokens[2]['text']['content'] == u'\u0201' + offset = tokens[2]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[2]['text']['content'] + + assert tokens[3]['text']['content'] == u'\U0001f636' + offset = tokens[3]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[3]['text']['content'] + + assert tokens[4]['text']['content'] == u'b' + offset = tokens[4]['text'].get('beginOffset', 0) + assert test_string[offset] == tokens[4]['text']['content'] diff --git a/language/api/requirements.txt b/language/api/requirements.txt new file mode 100644 index 00000000000..0b96c82ee4c --- /dev/null +++ b/language/api/requirements.txt @@ -0,0 +1 @@ +google-api-python-client==1.5.1 diff --git a/language/movie_nl/README.md b/language/movie_nl/README.md new file mode 100644 index 00000000000..b651dee8bb7 --- /dev/null +++ b/language/movie_nl/README.md @@ -0,0 +1,154 @@ +# Introduction +This sample is an application of the Google Cloud Platform Natural Language API. +It uses the [imdb movie reviews data set](https://www.cs.cornell.edu/people/pabo/movie-review-data/) +from [Cornell University](http://www.cs.cornell.edu/) and performs sentiment & entity +analysis on it. It combines the capabilities of sentiment analysis and entity recognition +to come up with actors/directors who are the most and least popular. + +### Set Up to Authenticate With Your Project's Credentials + +Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) +steps in the Quickstart doc to create a project and enable the +Cloud Natural Language API. Following those steps, make sure that you +[Set Up a Service Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), +and export the following environment variable: + +``` +export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json +``` + +**Note:** If you get an error saying your API hasn't been enabled, make sure +that you have correctly set this environment variable, and that the project that +you got the service account from has the Natural Language API enabled. + +## How it works +This sample uses the Natural Language API to annotate the input text. The +movie review document is broken into sentences using the `extract_syntax` feature. +Each sentence is sent to the API for sentiment analysis. The positive and negative +sentiment values are combined to come up with a single overall sentiment of the +movie document. + +In addition to the sentiment, the program also extracts the entities of type +`PERSON`, who are the actors in the movie (including the director and anyone +important). These entities are assigned the sentiment value of the document to +come up with the most and least popular actors/directors. + +### Movie document +We define a movie document as a set of reviews. These reviews are individual +sentences and we use the NL API to extract the sentences from the document. See +an example movie document below. + +``` + Sample review sentence 1. 
Sample review sentence 2. Sample review sentence 3. +``` + +### Sentences and Sentiment +Each sentence from the above document is assigned a sentiment as below. + +``` + Sample review sentence 1 => Sentiment 1 + Sample review sentence 2 => Sentiment 2 + Sample review sentence 3 => Sentiment 3 +``` + +### Sentiment computation +The final sentiment is computed by simply adding the sentence sentiments. + +``` + Total Sentiment = Sentiment 1 + Sentiment 2 + Sentiment 3 +``` + + +### Entity extraction and Sentiment assignment +Entities with type `PERSON` are extracted from the movie document using the NL +API. Since these entities are mentioned in their respective movie document, +they are associated with the document sentiment. + +``` + Document 1 => Sentiment 1 + + Person 1 + Person 2 + Person 3 + + Document 2 => Sentiment 2 + + Person 2 + Person 4 + Person 5 +``` + +Based on the above data we can calculate the sentiment associated with Person 2: + +``` + Person 2 => (Sentiment 1 + Sentiment 2) +``` + +## Movie Data Set +We have used the Cornell Movie Review data as our input. Please follow the instructions below to download and extract the data. + +### Download Instructions + +``` + $ curl -O http://www.cs.cornell.edu/people/pabo/movie-review-data/mix20_rand700_tokens.zip + $ unzip mix20_rand700_tokens.zip +``` + +## Command Line Usage +In order to use the movie analyzer, follow the instructions below. (Note that the `--sample` parameter below runs the script on +fewer documents, and can be omitted to run it on the entire corpus) + +### Install Dependencies + +Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. + +Then, install dependencies by running the following pip command: + +``` +$ pip install -r requirements.txt +``` +### How to Run + +``` +$ python main.py --inp "tokens/*/*" \ + --sout sentiment.json \ + --eout entity.json \ + --sample 5 +``` + +You should see the log file `movie.log` created. + +## Output Data +The program produces sentiment and entity output in json format. For example: + +### Sentiment Output +``` + { + "doc_id": "cv310_tok-16557.txt", + "sentiment": 3.099, + "label": -1 + } +``` + +### Entity Output + +``` + { + "name": "Sean Patrick Flanery", + "wiki_url": "http://en.wikipedia.org/wiki/Sean_Patrick_Flanery", + "sentiment": 3.099 + } +``` + +### Entity Output Sorting +In order to sort and rank the entities generated, use the same `main.py` script. For example, +this will print the top 5 actors with negative sentiment: + +``` +$ python main.py --inp entity.json \ + --sout sentiment.json \ + --eout entity.json \ + --sentiment neg \ + --reverse True \ + --sample 5 +``` diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py new file mode 100644 index 00000000000..380f495f967 --- /dev/null +++ b/language/movie_nl/main.py @@ -0,0 +1,346 @@ +# Copyright 2016 Google, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
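
"""Analyzes imdb movie reviews using the Google Cloud Natural Language API.

Each review document is split into sentences, each sentence is scored for
sentiment, and the sentence scores are summed into an overall document
sentiment. PERSON entities mentioned in a document are credited with that
document's sentiment, which makes it possible to rank actors and directors
by how positively their movies were reviewed. See the README for usage.
"""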
+ +import argparse +import codecs +import glob +import json +import logging +import os + +from googleapiclient import discovery +import httplib2 +from oauth2client.client import GoogleCredentials +import requests + + +def analyze_document(service, document): + """Analyze the document and get the distribution of sentiments and + the movie name.""" + logging.info('Analyzing {}'.format(document.doc_id)) + + sentences, entities = document.extract_all_sentences(service) + sentiments = [get_sentiment(service, sentence) for sentence in sentences] + + return sentiments, entities + + +def get_request_body(text, syntax=True, entities=True, sentiment=True): + """Creates the body of the request to the language api in + order to get an appropriate api response.""" + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'features': { + 'extract_syntax': syntax, + 'extract_entities': entities, + 'extract_document_sentiment': sentiment, + }, + 'encoding_type': 'UTF32' + } + + return body + + +def get_sentiment(service, sentence): + """Get the sentence-level sentiment.""" + body = get_request_body( + sentence, syntax=False, entities=True, sentiment=True) + + docs = service.documents() + request = docs.annotateText(body=body) + response = request.execute() + sentiment = response.get("documentSentiment") + + if sentiment is None: + return (None, None) + else: + pol = sentiment.get("polarity") + mag = sentiment.get("magnitude") + + if pol is None and mag is not None: + pol = 0 + return (pol, mag) + + +class Document(object): + """Document class captures a single document of movie reviews.""" + + def __init__(self, text, doc_id, doc_path): + self.text = text + self.doc_id = doc_id + self.doc_path = doc_path + self.sentent_pair = None + self.label = None + + def extract_all_sentences(self, service): + """Extract the sentences in a document.""" + + if self.sentent_pair is None: + docs = service.documents() + request_body = get_request_body( + self.text, + syntax=True, + entities=True, + sentiment=False) + request = docs.annotateText(body=request_body) + + ent_list = [] + + response = request.execute() + entities = response.get('entities', []) + sentences = response.get('sentences', []) + + sent_list = [ + sentence.get('text').get('content')for sentence in sentences + ] + + for entity in entities: + ent_type = entity.get('type') + wiki_url = entity.get('metadata', {}).get('wikipedia_url') + + if ent_type == 'PERSON' and wiki_url is not None: + ent_list.append(wiki_url) + + self.sentent_pair = (sent_list, ent_list) + + return self.sentent_pair + + +def to_sentiment_json(doc_id, sent, label): + """Convert the sentiment info to json.""" + json_doc = {} + + json_doc['doc_id'] = doc_id + json_doc['sentiment'] = float('%.3f' % sent) + json_doc['label'] = label + + return json.dumps(json_doc) + + +def get_wiki_title(wiki_url): + """Get the wikipedia page title for a given wikipedia URL.""" + try: + content = requests.get(wiki_url).text + return content.split('title')[1].split('-')[0].split('>')[1].strip() + except: + return os.path.basename(wiki_url).replace('_', ' ') + + +def to_entity_json(entity, e_tuple): + """Convert the entity info to json.""" + json_doc = {} + + avg_sentiment = float(e_tuple[0]) / float(e_tuple[1]) + + json_doc['wiki_url'] = entity + json_doc['name'] = get_wiki_title(entity) + json_doc['sentiment'] = float('%.3f' % e_tuple[0]) + json_doc['avg_sentiment'] = float('%.3f' % avg_sentiment) + + return json.dumps(json_doc) + + +def get_sentiment_entities(service, document): + 
"""Compute the overall sentiment volume in the document""" + sentiments, entities = analyze_document(service, document) + + sentiments = [sent for sent in sentiments if sent[0] is not None] + negative_sentiments = [ + polarity for polarity, magnitude in sentiments if polarity < 0.0] + positive_sentiments = [ + polarity for polarity, magnitude in sentiments if polarity > 0.0] + + negative = sum(negative_sentiment) + positive = sum(positive_sentiment) + total = positive + negative + + return (total, entities) + + +def get_sentiment_label(sentiment): + """Return the sentiment label based on the sentiment quantity.""" + if sentiment < 0: + return -1 + elif sentiment > 0: + return 1 + else: + return 0 + + +def process_movie_reviews(service, reader, sentiment_writer, entity_writer): + """Perform some sentiment math and come up with movie review.""" + collected_entities = {} + + for document in reader: + try: + sentiment_total, entities = get_sentiment_entities( + service, document) + document.label = get_sentiment_label(sentiment_total) + + sentiment_writer.write( + to_sentiment_json( + document.doc_id, + sentiment_total, + document.label + ) + ) + + sentiment_writer.write('\n') + + for ent in entities: + ent_sent, frequency = collected_entities.get(ent, (0, 0)) + ent_sent += sentiment_total + frequency += 1 + + collected_entities[ent] = (ent_sent, frequency) + + except Exception: + logging.info('Skipping {}'.format(document.doc_id)) + + for entity, e_tuple in collected_entities.items(): + entity_writer.write(to_entity_json(entity, e_tuple)) + entity_writer.write('\n') + + sentiment_writer.flush() + entity_writer.flush() + + +def document_generator(dir_path_pattern, count=None): + """Generator for the input movie documents.""" + for running_count, item in enumerate(glob.iglob(dir_path_pattern)): + if count and running_count >= count: + raise StopIteration() + + doc_id = os.path.basename(item) + + with codecs.open(item, encoding='utf-8') as f: + try: + text = f.read() + except UnicodeDecodeError: + text = None + + yield Document(text, doc_id, item) + + +def rank_entities(reader, sentiment=None, topn=None, reverse_bool=False): + """Rank the entities (actors) based on their sentiment + assigned from the movie.""" + + items = [] + for item in reader: + json_item = json.loads(item) + sent = json_item.get('sentiment') + entity_item = (sent, json_item) + + if sentiment: + if sentiment == 'pos' and sent > 0: + items.append(entity_item) + elif sentiment == 'neg' and sent < 0: + items.append(entity_item) + else: + items.append(entity_item) + + items.sort(reverse=True) + items = [json.dumps(item[1]) for item in items] + + if reverse_bool: + items.reverse() + + if topn: + print('\n'.join(items[:topn])) + else: + print('\n'.join(items)) + + +def get_service(): + """Build a client to the Google Cloud Natural Language API.""" + + credentials = GoogleCredentials.get_application_default() + scoped_credentials = credentials.create_scoped( + ['https://www.googleapis.com/auth/cloud-platform']) + http = httplib2.Http() + scoped_credentials.authorize(http) + return discovery.build('language', 'v1beta1', http=http) + + +def main(input_dir, sent_out, ent_out, sample, log_file, + operation, sentiment, ent_in, reverse_bool): + """Movie demo main program""" + + sample = int(sample) if sample else None + + if operation == 'rank': + with open(ent_in) as reader: + rank_entities(reader, sentiment, sample, reverse_bool) + else: + # Create logger settings + logging.basicConfig(filename=log_file, level=logging.DEBUG) + + # 
Create a Google Service object + service = get_service() + + # Create a sentiment output writer + sentiment_writer = open(sent_out, 'w') + + # Create an entity output writer + entity_writer = open(ent_out, 'w') + + reader = document_generator(input_dir, sample) + + # Process the movie documents + process_movie_reviews(service, reader, sentiment_writer, entity_writer) + + # close reader and writers + sentiment_writer.close() + entity_writer.close() + reader.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('--inp', help='location of the input', required=True) + parser.add_argument( + '--sout', help='location of the sentiment output', required=True) + parser.add_argument( + '--eout', help='location of the entity output', required=True) + parser.add_argument('--sample', help='number of top items to process') + parser.add_argument( + '--op', + help='operation to perform "rank" or "analyze"', + default='analyze') + parser.add_argument( + '--sentiment', help='filter sentiment as "neg" or "pos"') + parser.add_argument( + '--ein', help='location of entity input') + parser.add_argument( + '--reverse', help='reverse the order of the items') + + args = parser.parse_args() + + log_file = 'movie.log' + + main(args.inp, + args.sout, + args.eout, + args.sample, + log_file, + args.op, + args.sentiment, + args.ein, + args.reverse) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py new file mode 100644 index 00000000000..96907908018 --- /dev/null +++ b/language/movie_nl/main_test.py @@ -0,0 +1,82 @@ +# Copyright 2016 Google, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
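
"""Tests for the movie_nl sample.

test_process_movie_reviews exercises the full pipeline against the live
Natural Language API, so GOOGLE_APPLICATION_CREDENTIALS must be set.
"""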
+ +import io +import json + +import main + + +def test_get_request_body(): + text = 'hello world' + body = main.get_request_body(text, syntax=True, entities=True, + sentiment=False) + assert body.get('document').get('content') == text + + assert body.get('features').get('extract_syntax') is True + assert body.get('features').get('extract_entities') is True + assert body.get('features').get('extract_document_sentiment') is False + + +def test_get_sentiment_label(): + assert main.get_sentiment_label(20.50) == 1 + assert main.get_sentiment_label(-42.34) == -1 + + +def test_to_sentiment_json(): + doc_id = '12345' + sentiment = 23.344564 + label = 1 + + sentiment_json = json.loads( + main.to_sentiment_json(doc_id, sentiment, label) + ) + + assert sentiment_json.get('doc_id') == doc_id + assert sentiment_json.get('sentiment') == 23.345 + assert sentiment_json.get('label') == label + + +def test_process_movie_reviews(): + service = main.get_service() + + doc1 = main.Document('Top Gun was awesome and Tom Cruise rocked!', 'doc1', + 'doc1') + doc2 = main.Document('Tom Cruise is a great actor.', 'doc2', 'doc2') + + reader = [doc1, doc2] + swriter = io.StringIO() + ewriter = io.StringIO() + + main.process_movie_reviews(service, reader, swriter, ewriter) + + sentiments = swriter.getvalue().strip().split('\n') + entities = ewriter.getvalue().strip().split('\n') + + sentiments = [json.loads(sentiment) for sentiment in sentiments] + entities = [json.loads(entity) for entity in entities] + + # assert sentiments + assert sentiments[0].get('sentiment') == 1.0 + assert sentiments[0].get('label') == 1 + + assert sentiments[1].get('sentiment') == 1.0 + assert sentiments[1].get('label') == 1 + + # assert entities + assert len(entities) == 1 + assert entities[0].get('name') == 'Tom Cruise' + assert (entities[0].get('wiki_url') == + 'http://en.wikipedia.org/wiki/Tom_Cruise') + assert entities[0].get('sentiment') == 2.0 diff --git a/language/movie_nl/requirements.txt b/language/movie_nl/requirements.txt new file mode 100644 index 00000000000..391be2e9843 --- /dev/null +++ b/language/movie_nl/requirements.txt @@ -0,0 +1,3 @@ +urlparse2==1.1.1 +google-api-python-client==1.5.1 +requests==2.10.0 diff --git a/language/ocr_nl/README.md b/language/ocr_nl/README.md new file mode 100644 index 00000000000..189e9397901 --- /dev/null +++ b/language/ocr_nl/README.md @@ -0,0 +1,227 @@ + +# Using the Cloud Natural Language API to analyze image text found with Cloud Vision + +This example uses the [Cloud Vision API](https://cloud.google.com/vision/) to +detect text in images, then analyzes that text using the [Cloud NL (Natural +Language) API](https://cloud.google.com/natural-language/) to detect +[entities](https://cloud.google.com/natural-language/docs/basics#entity_analysis) +in the text. It stores the detected entity +information in an [sqlite3](https://www.sqlite.org) database, which may then be +queried. + +(This kind of analysis can be useful with scans of brochures and fliers, +invoices, and other types of company documents... or maybe just organizing your +memes). + +After the example script has analyzed a directory of images, it outputs some +information on the images' entities to STDOUT. You can also further query +the generated sqlite3 database. + +## Setup + +### Install sqlite3 as necessary + +The example requires that sqlite3 be installed. Most likely, sqlite3 is already +installed for you on your machine, but if not, you can find it +[here](https://www.sqlite.org/download.html). 
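
You can verify that the `sqlite3` command line shell is available (and see
which version you have) with:

```sh
$ sqlite3 --version
```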
+ +### Set Up to Authenticate With Your Project's Credentials + +* Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) +steps in the Quickstart doc to create a project and enable the +Cloud Natural Language API. +* Following those steps, make sure that you [Set Up a Service + Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), + and export the following environment variable: + + ``` + export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json + ``` +* This sample also requires that you [enable the Cloud Vision + API](https://console.cloud.google.com/apis/api/vision.googleapis.com/overview?project=_) + +## Running the example + +Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. + +To run the example, install the necessary libraries using pip: + +```sh +$ pip install -r requirements.txt +``` + +You must also be set up to authenticate with the Cloud APIs using your +project's service account credentials, as described above. + +Then, run the script on a directory of images to do the analysis, E.g.: + +```sh +$ python main.py --input_directory= +``` + +You can try this on a sample directory of images: + +```sh +$ curl -O http://storage.googleapis.com/python-docs-samples-tests/language/ocr_nl-images.zip +$ unzip ocr_nl-images.zip +$ python main.py --input_directory=images/ +``` + +## A walkthrough of the example and its results + +Let's take a look at what the example generates when run on the `images/` +sample directory, and how it does it. + +The script looks at each image file in the given directory, and uses the Vision +API's text detection capabilities (OCR) to find any text in each image. It +passes that info to the NL API, and asks it to detect [entities](xxx) in the +discovered text, then stores this information in a queryable database. + +To keep things simple, we're just passing to the NL API all the text found in a +given image, in one string. Note that sometimes this string can include +misinterpreted characters (if the image text was not very clear), or list words +"out of order" from how a human would interpret them. So, the text that is +actually passed to the NL API might not be quite what you would have predicted +with your human eyeballs. + +The Entity information returned by the NL API includes *type*, *name*, *salience*, +information about where in the text the given entity was found, and detected +language. It may also include *metadata*, including a link to a Wikipedia URL +that the NL API believes this entity maps to. See the +[documentation](https://cloud.google.com/natural-language/docs/) and the [API +reference pages](https://cloud.google.com/natural-language/reference/rest/v1beta1/Entity) +for more information about `Entity` fields. + +For example, if the NL API was given the sentence: + +``` +"Holmes and Watson walked over to the cafe." 
+``` + +it would return a response something like the following: + +``` +{ + "entities": [{ + "salience": 0.51629782, + "mentions": [{ + "text": { + "content": "Holmes", + "beginOffset": 0 + }}], + "type": "PERSON", + "name": "Holmes", + "metadata": { + "wikipedia_url": "http://en.wikipedia.org/wiki/Sherlock_Holmes" + }}, + { + "salience": 0.22334209, + "mentions": [{ + "text": { + "content": "Watson", + "beginOffset": 11 + }}], + "type": "PERSON", + "name": "Watson", + "metadata": { + "wikipedia_url": "http://en.wikipedia.org/wiki/Dr._Watson" + }}], + "language": "en" +} +``` + +Note that the NL API determined from context that "Holmes" was referring to +'Sherlock Holmes', even though the name "Sherlock" was not included. + +Note also that not all nouns in a given sentence are detected as Entities. An +Entity represents a phrase in the text that is a known entity, such as a person, +an organization, or location. The generic mention of a 'cafe' is not treated as +an entity in this sense. + +For each image file, we store its detected entity information (if any) in an +sqlite3 database. + +### Querying for information about the detected entities + +Once the detected entity information from all the images is stored in the +sqlite3 database, we can run some queries to do some interesting analysis. The +script runs a couple of such example query sets and outputs the result to STDOUT. + +The first set of queries outputs information about the top 15 most frequent +entity names found in the images, and the second outputs information about the +top 15 most frequent Wikipedia URLs found. + +For example, with the sample image set, note that the name 'Sherlock Holmes' is +found three times, but entities associated with the URL +http://en.wikipedia.org/wiki/Sherlock_Holmes are found four times; one of the +entity names was only "Holmes", but the NL API detected from context that it +referred to Sherlock Holmes. Similarly, you can see that mentions of 'Hive' and +'Spark' mapped correctly – given their context – to the URLs of those Apache +products. + +``` +----entity: http://en.wikipedia.org/wiki/Apache_Hive was found with count 1 +Found in file images/IMG_20160621_133020.jpg, detected as type OTHER, with + locale en. +names(s): set([u'hive']) +salience measure(s): set([0.0023808887]) +``` + +Similarly, 'Elizabeth' (in screencaps of text from "Pride and Prejudice") is +correctly mapped to http://en.wikipedia.org/wiki/Elizabeth_Bennet because of the +context of the surrounding text. + +``` +----entity: http://en.wikipedia.org/wiki/Elizabeth_Bennet was found with count 2 +Found in file images/Screenshot 2016-06-19 11.51.50.png, detected as type PERSON, with + locale en. +Found in file images/Screenshot 2016-06-19 12.08.30.png, detected as type PERSON, with + locale en. +names(s): set([u'elizabeth']) +salience measure(s): set([0.34601286, 0.0016268975]) +``` + +## Further queries to the sqlite3 database + +When the script runs, it makes a couple of example queries to the database +containing the entity information returned from the NL API. You can make further +queries on that database by starting up sqlite3 from the command line, and +passing it the name of the database file generated by running the example. This +file will be in the same directory, and have `entities` as a prefix, with the +timestamp appended. (If you have run the example more than once, a new database +file will be created each time). 
+ +Run sqlite3 as follows (using the name of your own database file): + +```sh +$ sqlite3 entities1466518508.db +``` + +You'll see something like this: + +``` +SQLite version 3.8.10.2 2015-05-20 18:17:19 +Enter ".help" for usage hints. +sqlite> +``` + +From this prompt, you can make any queries on the data that you want. E.g., +start with something like: + +``` +sqlite> select * from entities limit 20; +``` + +Or, try this to see in which images the most entities were detected: + +``` +sqlite> select filename, count(filename) from entities group by filename; +``` + +You can do more complex queries to get further information about the entities +that have been discovered in your images. E.g., you might want to investigate +which of the entities are most commonly found together in the same image. See +the [SQLite documentation](https://www.sqlite.org/docs.html) for more +information. + + diff --git a/language/ocr_nl/main.py b/language/ocr_nl/main.py new file mode 100755 index 00000000000..6e329f53386 --- /dev/null +++ b/language/ocr_nl/main.py @@ -0,0 +1,362 @@ +#!/usr/bin/env python +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example uses the Google Cloud Vision API to detect text in images, then +analyzes that text using the Google Cloud Natural Language API to detect +entities in the text. It stores the detected entity information in an sqlite3 +database, which may then be queried. + +After this script has analyzed a directory of images, it outputs some +information on the images' entities to STDOUT. You can also further query +the generated sqlite3 database; see the README for more information. + +Run the script on a directory of images to do the analysis, E.g.: + $ python main.py --input_directory= + +You can try this on a sample directory of images: + $ curl -O http://storage.googleapis.com/python-docs-samples-tests/language/ocr_nl-images.zip + $ unzip ocr_nl-images.zip + $ python main.py --input_directory=images/ + +""" # noqa + +import argparse +import base64 +import contextlib +import logging +import os +import sqlite3 +import sys +import time + +from googleapiclient import discovery +from googleapiclient import errors +import httplib2 +from oauth2client.client import GoogleCredentials + +BATCH_SIZE = 10 + + +class VisionApi(object): + """Construct and use the Cloud Vision API service.""" + + def __init__(self): + credentials = GoogleCredentials.get_application_default() + self.service = discovery.build('vision', 'v1', credentials=credentials) + + def detect_text(self, input_filenames, num_retries=3, max_results=6): + """Uses the Vision API to detect text in the given file.""" + batch_request = [] + for filename in input_filenames: + request = { + 'image': {}, + 'features': [{ + 'type': 'TEXT_DETECTION', + 'maxResults': max_results, + }] + } + + # Accept both files in cloud storage, as well as local files. 
+ if filename.startswith('gs://'): + request['image']['source'] = { + 'gcsImageUri': filename + } + else: + with open(filename, 'rb') as image_file: + request['image']['content'] = base64.b64encode( + image_file.read()).decode('UTF-8') + + batch_request.append(request) + + request = self.service.images().annotate( + body={'requests': batch_request}) + + try: + responses = request.execute(num_retries=num_retries) + if 'responses' not in responses: + return {} + + text_response = {} + for filename, response in zip( + input_filenames, responses['responses']): + + if 'error' in response: + logging.error('API Error for {}: {}'.format( + filename, + response['error'].get('message', ''))) + continue + + text_response[filename] = response.get('textAnnotations', []) + + return text_response + + except errors.HttpError as e: + logging.error('Http Error for {}: {}'.format(filename, e)) + except KeyError as e2: + logging.error('Key error: {}'.format(e2)) + + +class TextAnalyzer(object): + """Construct and use the Google Natural Language API service.""" + + def __init__(self, db_filename=None): + credentials = GoogleCredentials.get_application_default() + scoped_credentials = credentials.create_scoped( + ['https://www.googleapis.com/auth/cloud-platform']) + http = httplib2.Http() + scoped_credentials.authorize(http) + self.service = discovery.build('language', 'v1beta1', http=http) + + # This list will store the entity information gleaned from the + # image files. + self.entity_info = [] + + # This is the filename of the sqlite3 database to save to + self.db_filename = db_filename or 'entities{}.db'.format( + int(time.time())) + + def _get_native_encoding_type(self): + """Returns the encoding type that matches Python's native strings.""" + if sys.maxunicode == 65535: + return 'UTF16' + else: + return 'UTF32' + + def nl_detect(self, text): + """Use the Natural Language API to analyze the given text string.""" + # We're only requesting 'entity' information from the Natural Language + # API at this time. + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'encodingType': self._get_native_encoding_type(), + } + entities = [] + try: + request = self.service.documents().analyzeEntities(body=body) + response = request.execute() + entities = response['entities'] + except errors.HttpError as e: + logging.error('Http Error: %s' % e) + except KeyError as e2: + logging.error('Key error: %s' % e2) + return entities + + def add_entities(self, filename, locale, document): + """Apply the Natural Language API to the document, and collect the + detected entities.""" + + # Apply the Natural Language API to the document. + entities = self.nl_detect(document) + self.extract_and_save_entity_info(entities, locale, filename) + + def extract_entity_info(self, entity): + """Extract information about an entity.""" + type = entity['type'] + name = entity['name'].lower() + metadata = entity['metadata'] + salience = entity['salience'] + wiki_url = metadata.get('wikipedia_url', None) + return (type, name, salience, wiki_url) + + def extract_and_save_entity_info(self, entities, locale, filename): + for entity in entities: + type, name, salience, wiki_url = self.extract_entity_info(entity) + # Because this is a small example, we're using a list to hold + # all the entity information, then we'll insert it into the + # database all at once when we've processed all the files. + # For a larger data set, you would want to write to the database + # in batches. 
+ self.entity_info.append( + (locale, type, name, salience, wiki_url, filename)) + + def write_entity_info_to_db(self): + """Store the info gleaned about the entities in the text, via the + Natural Language API, in an sqlite3 database table, and then print out + some simple analytics. + """ + logging.info('Saving entity info to the sqlite3 database.') + # Create the db. + with contextlib.closing(sqlite3.connect(self.db_filename)) as conn: + with conn as cursor: + # Create table + cursor.execute( + 'CREATE TABLE if not exists entities (locale text, ' + 'type text, name text, salience real, wiki_url text, ' + 'filename text)') + with conn as cursor: + # Load all the data + cursor.executemany( + 'INSERT INTO entities VALUES (?,?,?,?,?,?)', + self.entity_info) + + def output_entity_data(self): + """Output some info about the entities by querying the generated + sqlite3 database. + """ + + with contextlib.closing(sqlite3.connect(self.db_filename)) as conn: + + # This query finds the number of times each entity name was + # detected, in descending order by count, and returns information + # about the first 15 names, including the files in which they were + # found, their detected 'salience' and language (locale), and the + # wikipedia urls (if any) associated with them. + print('\n==============\nTop 15 most frequent entity names:') + + cursor = conn.cursor() + results = cursor.execute( + 'select name, count(name) as wc from entities ' + 'group by name order by wc desc limit 15;') + + for item in results: + cursor2 = conn.cursor() + print(u'\n----Name: {} was found with count {}'.format(*item)) + results2 = cursor2.execute( + 'SELECT name, type, filename, locale, wiki_url, salience ' + 'FROM entities WHERE name=?', (item[0],)) + urls = set() + for elt in results2: + print(('Found in file {}, detected as type {}, with\n' + ' locale {} and salience {}.').format( + elt[2], elt[1], elt[3], elt[5])) + if elt[4]: + urls.add(elt[4]) + if urls: + print('url(s): {}'.format(urls)) + + # This query finds the number of times each wikipedia url was + # detected, in descending order by count, and returns information + # about the first 15 urls, including the files in which they were + # found and the names and 'salience' with which they were + # associated. + print('\n==============\nTop 15 most frequent Wikipedia URLs:') + c = conn.cursor() + results = c.execute( + 'select wiki_url, count(wiki_url) as wc from entities ' + 'group by wiki_url order by wc desc limit 15;') + + for item in results: + cursor2 = conn.cursor() + print('\n----entity: {} was found with count {}'.format(*item)) + results2 = cursor2.execute( + 'SELECT name, type, filename, locale, salience ' + 'FROM entities WHERE wiki_url=?', (item[0],)) + names = set() + salience = set() + for elt in results2: + print(('Found in file {}, detected as type {}, with\n' + ' locale {}.').format(elt[2], elt[1], elt[3])) + names.add(elt[0]) + salience.add(elt[4]) + print('names(s): {}'.format(names)) + print('salience measure(s): {}'.format(salience)) + + +def extract_description(texts): + """Returns text annotations as a single string""" + document = [] + + for text in texts: + try: + document.append(text['description']) + locale = text['locale'] + # Process only the first entry, which contains all + # text detected. 
+ break + except KeyError as e: + logging.error('KeyError: %s\n%s' % (e, text)) + return (locale, ' '.join(document)) + + +def extract_descriptions(input_filename, texts, text_analyzer): + """Gets the text that was detected in the image.""" + if texts: + locale, document = extract_description(texts) + text_analyzer.add_entities(input_filename, locale, document) + sys.stdout.write('.') # Output a progress indicator. + sys.stdout.flush() + elif texts == []: + print('%s had no discernible text.' % input_filename) + + +def get_text_from_files(vision, input_filenames, text_analyzer): + """Call the Vision API on a file and index the results.""" + texts = vision.detect_text(input_filenames) + if texts: + for filename, text in texts.items(): + extract_descriptions(filename, text, text_analyzer) + + +def batch(list_to_batch, batch_size=BATCH_SIZE): + """Group a list into batches of size batch_size. + + >>> tuple(batch([1, 2, 3, 4, 5], batch_size=2)) + ((1, 2), (3, 4), (5)) + """ + for i in range(0, len(list_to_batch), batch_size): + yield tuple(list_to_batch[i:i + batch_size]) + + +def main(input_dir, db_filename=None): + """Walk through all the image files in the given directory, extracting any + text from them and feeding that text to the Natural Language API for + analysis. + """ + # Create a client object for the Vision API + vision_api_client = VisionApi() + # Create an object to analyze our text using the Natural Language API + text_analyzer = TextAnalyzer(db_filename) + + if input_dir: + allfileslist = [] + # Recursively construct a list of all the files in the given input + # directory. + for folder, subs, files in os.walk(input_dir): + for filename in files: + allfileslist.append(os.path.join(folder, filename)) + + # Analyze the text in the files using the Vision and Natural Language + # APIs. + for filenames in batch(allfileslist, batch_size=1): + get_text_from_files(vision_api_client, filenames, text_analyzer) + + # Save the result to a database, then run some queries on the database, + # with output to STDOUT. + text_analyzer.write_entity_info_to_db() + + # now, print some information about the entities detected. + text_analyzer.output_entity_data() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Detects text in the images in the given directory.') + parser.add_argument( + '--input_directory', + help='The image directory you\'d like to detect text in. If left ' + 'unspecified, the --db specified will be queried without being ' + 'updated.') + parser.add_argument( + '--db', help='The filename to use for the sqlite3 database.') + args = parser.parse_args() + + if not (args.input_directory or args.db): + parser.error('Either --input_directory or --db must be specified.') + + main(args.input_directory, args.db) diff --git a/language/ocr_nl/main_test.py b/language/ocr_nl/main_test.py new file mode 100755 index 00000000000..c07ed747ea0 --- /dev/null +++ b/language/ocr_nl/main_test.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for main.""" + +import re +import zipfile + +import main + + +_TEST_IMAGE_URI = 'gs://{}/language/image8.png' + + +def test_batch_empty(): + for batch_size in range(1, 10): + assert len( + list(main.batch([], batch_size=batch_size))) == 0 + + +def test_batch_single(): + for batch_size in range(1, 10): + batched = tuple(main.batch([1], batch_size=batch_size)) + assert batched == ((1,),) + + +def test_single_image_returns_text(cloud_config): + vision_api_client = main.VisionApi() + + image_path = _TEST_IMAGE_URI.format(cloud_config.storage_bucket) + texts = vision_api_client.detect_text([image_path]) + + assert image_path in texts + _, document = main.extract_description(texts[image_path]) + assert "daughter" in document + assert "Bennet" in document + assert "hat" in document + + +def test_single_nonimage_returns_error(): + vision_api_client = main.VisionApi() + texts = vision_api_client.detect_text(['README.md']) + assert "README.md" not in texts + + +def test_text_returns_entities(): + text = "Holmes and Watson walked to the cafe." + text_analyzer = main.TextAnalyzer() + entities = text_analyzer.nl_detect(text) + assert len(entities) == 2 + etype, ename, salience, wurl = text_analyzer.extract_entity_info( + entities[0]) + assert ename == 'holmes' + assert wurl == 'http://en.wikipedia.org/wiki/Sherlock_Holmes' + + +def test_entities_list(cloud_config): + vision_api_client = main.VisionApi() + image_path = _TEST_IMAGE_URI.format(cloud_config.storage_bucket) + texts = vision_api_client.detect_text([image_path]) + locale, document = main.extract_description(texts[image_path]) + text_analyzer = main.TextAnalyzer() + entities = text_analyzer.nl_detect(document) + assert len(entities) == 4 + etype, ename, salience, wurl = text_analyzer.extract_entity_info( + entities[0]) + assert ename == 'bennet' + assert wurl == 'http://en.wikipedia.org/wiki/Mr_Bennet' + + +def test_main(remote_resource, tmpdir, capsys): + images_path = str(tmpdir.mkdir('images')) + + # First, pull down some test data + zip_path = remote_resource('language/ocr_nl-images-small.zip', tmpdir) + + # Extract it to the image directory + with zipfile.ZipFile(zip_path) as zfile: + zfile.extractall(images_path) + + main.main(images_path, str(tmpdir.join('ocr_nl.db'))) + + stdout, _ = capsys.readouterr() + + assert re.search(r'google was found with count', stdout) diff --git a/language/ocr_nl/requirements.txt b/language/ocr_nl/requirements.txt new file mode 100644 index 00000000000..0b96c82ee4c --- /dev/null +++ b/language/ocr_nl/requirements.txt @@ -0,0 +1 @@ +google-api-python-client==1.5.1 diff --git a/language/syntax_triples/README.md b/language/syntax_triples/README.md new file mode 100644 index 00000000000..1342ee65289 --- /dev/null +++ b/language/syntax_triples/README.md @@ -0,0 +1,91 @@ +# Using the Cloud Natural Language API to find subject-verb-object triples in text + +This example finds subject-verb-object triples in a given piece of text using +syntax analysis capabilities of +[Cloud Natural Language API](https://cloud.google.com/natural-language/). +To do this, it calls the extractSyntax feature of the API +and uses the dependency parse tree and part-of-speech tags in the resposne +to build the subject-verb-object triples. The results are printed to STDOUT. +This type of analysis can be considered as the +first step towards an information extraction task. 
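+
+Concretely, each token in the API response carries a `partOfSpeech.tag` and a
+`dependencyEdge` giving the index of its head token plus a label such as `NSUBJ`
+(nominal subject) or `DOBJ` (direct object); the triples are assembled from these
+two pieces of information. As a rough, abbreviated illustration (the real response
+contains more fields than shown), the subject token of a sentence like
+"He began his presidential campaign" looks roughly like this:
+
+```
+# Illustrative, abbreviated token for "He"; index 1 ("began") is its head.
+{
+    'text': {'content': 'He', 'beginOffset': 0},
+    'partOfSpeech': {'tag': 'PRON'},
+    'dependencyEdge': {'headTokenIndex': 1, 'label': 'NSUBJ'},
+}
+```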
+ +## Set Up to Authenticate With Your Project's Credentials + +Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) +steps in the Quickstart doc to create a project and enable the +Cloud Natural Language API. Following those steps, make sure that you +[Set Up a Service Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), +and export the following environment variable: + +``` +export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json +``` + +## Running the example + +Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. + +To run the example, install the necessary libraries using pip: + +``` +$ pip install -r requirements.txt +``` +You must also be set up to authenticate with the Cloud APIs using your +project's service account credentials, as described above. + +Then, run the script on a file containing the text that you wish to analyze. +The text must be encoded in UTF8 or ASCII: + +``` +$ python main.py +``` + +Try this on a sample text in the resources directory: + +``` +$ python main.py resources/obama_wikipedia.txt +``` + +## A walkthrough of the example and its results + +Let's take a look at what the example generates when run on the +`obama_wikipedia.txt` sample file, and how it does it. + +The goal is to find all subject-verb-object +triples in the text. The example first sends the text to the Cloud Natural +Language API to perform extractSyntax analysis. Then, using part-of-speech tags, + it finds all the verbs in the text. For each verb, it uses the dependency +parse tree information to find all the dependent tokens. + +For example, given the following sentence in the `obama_wikipedia.txt` file: + +``` +"He began his presidential campaign in 2007" +``` +The example finds the verb `began`, and `He`, `campaign`, and `in` as its +dependencies. Then the script enumerates the dependencies for each verb and +finds all the subjects and objects. For the sentence above, the found subject +and object are `He` and `campaign`. + +The next step is to complete each subject and object token by adding their +dependencies to them. For example, in the sentence above, `his` and +`presidential` are dependent tokens for `campaign`. This is done using the +dependency parse tree, similar to verb dependencies as explained above. The +final result is (`He`, `began`, `his presidential campaign`) triple for +the example sentence above. + +The script performs this analysis for the entire text and prints the result. +For the `obama_wikipedia.txt` file, the result is the following: + +```sh ++------------------------------+------------+------------------------------+ +| Obama | received | national attention | ++------------------------------+------------+------------------------------+ +| He | began | his presidential campaign | ++------------------------------+------------+------------------------------+ +| he | won | sufficient delegates in the | +| | | Democratic Party primaries | ++------------------------------+------------+------------------------------+ +| He | defeated | Republican nominee John | +| | | McCain | +``` diff --git a/language/syntax_triples/main.py b/language/syntax_triples/main.py new file mode 100644 index 00000000000..1be174bff04 --- /dev/null +++ b/language/syntax_triples/main.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python +# Copyright 2016 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example finds subject-verb-object triples in a given piece of text using +the syntax analysis capabilities of Cloud Natural Language API. The triples are +printed to STDOUT. This can be considered as the first step towards an +information extraction task. + +Run the script on a file containing the text that you wish to analyze. +The text must be encoded in UTF8 or ASCII: + $ python main.py + +Try this on a sample text in the resources directory: + $ python main.py resources/obama_wikipedia.txt +""" + +import argparse +import sys +import textwrap + +from googleapiclient import discovery +import httplib2 +from oauth2client.client import GoogleCredentials + + +def dependents(tokens, head_index): + """Returns an ordered list of the token indices of the dependents for + the given head.""" + # Create head->dependency index. + head_to_deps = {} + for i, token in enumerate(tokens): + head = token['dependencyEdge']['headTokenIndex'] + if i != head: + head_to_deps.setdefault(head, []).append(i) + return head_to_deps.get(head_index, ()) + + +def phrase_text_for_head(tokens, text, head_index): + """Returns the entire phrase containing the head token + and its dependents. + """ + begin, end = phrase_extent_for_head(tokens, head_index) + return text[begin:end] + + +def phrase_extent_for_head(tokens, head_index): + """Returns the begin and end offsets for the entire phrase + containing the head token and its dependents. + """ + begin = tokens[head_index]['text']['beginOffset'] + end = begin + len(tokens[head_index]['text']['content']) + for child in dependents(tokens, head_index): + child_begin, child_end = phrase_extent_for_head(tokens, child) + begin = min(begin, child_begin) + end = max(end, child_end) + return (begin, end) + + +def analyze_syntax(text): + """Use the NL API to analyze the given text string, and returns the + response from the API. Requests an encodingType that matches + the encoding used natively by Python. Raises an + errors.HTTPError if there is a connection problem. 
+ """ + credentials = GoogleCredentials.get_application_default() + scoped_credentials = credentials.create_scoped( + ['https://www.googleapis.com/auth/cloud-platform']) + http = httplib2.Http() + scoped_credentials.authorize(http) + service = discovery.build( + 'language', 'v1beta1', http=http) + body = { + 'document': { + 'type': 'PLAIN_TEXT', + 'content': text, + }, + 'features': { + 'extract_syntax': True, + }, + 'encodingType': get_native_encoding_type(), + } + request = service.documents().annotateText(body=body) + return request.execute() + + +def get_native_encoding_type(): + """Returns the encoding type that matches Python's native strings.""" + if sys.maxunicode == 65535: + return 'UTF16' + else: + return 'UTF32' + + +def find_triples(tokens, + left_dependency_label='NSUBJ', + head_part_of_speech='VERB', + right_dependency_label='DOBJ'): + """Generator function that searches the given tokens + with the given part of speech tag, that have dependencies + with the given labels. For each such head found, yields a tuple + (left_dependent, head, right_dependent), where each element of the + tuple is an index into the tokens array. + """ + for head, token in enumerate(tokens): + if token['partOfSpeech']['tag'] == head_part_of_speech: + children = dependents(tokens, head) + left_deps = [] + right_deps = [] + for child in children: + child_token = tokens[child] + child_dep_label = child_token['dependencyEdge']['label'] + if child_dep_label == left_dependency_label: + left_deps.append(child) + elif child_dep_label == right_dependency_label: + right_deps.append(child) + for left_dep in left_deps: + for right_dep in right_deps: + yield (left_dep, head, right_dep) + + +def show_triple(tokens, text, triple): + """Prints the given triple (left, head, right). For left and right, + the entire phrase headed by each token is shown. For head, only + the head token itself is shown. + + """ + nsubj, verb, dobj = triple + + # Extract the text for each element of the triple. + nsubj_text = phrase_text_for_head(tokens, text, nsubj) + verb_text = tokens[verb]['text']['content'] + dobj_text = phrase_text_for_head(tokens, text, dobj) + + # Pretty-print the triple. + left = textwrap.wrap(nsubj_text, width=28) + mid = textwrap.wrap(verb_text, width=10) + right = textwrap.wrap(dobj_text, width=28) + print('+' + 30 * '-' + '+' + 12 * '-' + '+' + 30 * '-' + '+') + for l, m, r in zip(left, mid, right): + print('| {:<28s} | {:<10s} | {:<28s} |'.format( + l or '', m or '', r or '')) + + +def main(text_file): + # Extracts subject-verb-object triples from the given text file, + # and print each one. + + # Read the input file. + text = open(text_file, 'rb').read().decode('utf8') + + analysis = analyze_syntax(text) + tokens = analysis.get('tokens', []) + + for triple in find_triples(tokens): + show_triple(tokens, text, triple) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument( + 'text_file', + help='A file containing the document to process. ' + 'Should be encoded in UTF8 or ASCII') + args = parser.parse_args() + main(args.text_file) diff --git a/language/syntax_triples/main_test.py b/language/syntax_triples/main_test.py new file mode 100755 index 00000000000..62c2915da02 --- /dev/null +++ b/language/syntax_triples/main_test.py @@ -0,0 +1,50 @@ +# Copyright 2016 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import main + + +def test_dependents(): + text = "I am eating a delicious banana" + analysis = main.analyze_syntax(text) + tokens = analysis.get('tokens', []) + assert [0, 1, 5] == main.dependents(tokens, 2) + assert [3, 4] == main.dependents(tokens, 5) + + +def test_phrase_text_for_head(): + text = "A small collection of words" + analysis = main.analyze_syntax(text) + tokens = analysis.get('tokens', []) + assert "words" == main.phrase_text_for_head(tokens, text, 4) + + +def test_find_triples(): + text = "President Obama won the noble prize" + analysis = main.analyze_syntax(text) + tokens = analysis.get('tokens', []) + triples = main.find_triples(tokens) + for triple in triples: + assert (1, 2, 5) == triple + + +def test_obama_example(resource, capsys): + main.main(resource('obama_wikipedia.txt')) + stdout, _ = capsys.readouterr() + lines = stdout.split('\n') + assert re.match( + r'.*Obama\b.*\| received\b.*\| national attention\b', + lines[1]) diff --git a/language/syntax_triples/requirements.txt b/language/syntax_triples/requirements.txt new file mode 100644 index 00000000000..0b96c82ee4c --- /dev/null +++ b/language/syntax_triples/requirements.txt @@ -0,0 +1 @@ +google-api-python-client==1.5.1 diff --git a/language/syntax_triples/resources/obama_wikipedia.txt b/language/syntax_triples/resources/obama_wikipedia.txt new file mode 100644 index 00000000000..1e89d4ab081 --- /dev/null +++ b/language/syntax_triples/resources/obama_wikipedia.txt @@ -0,0 +1 @@ +In 2004, Obama received national attention during his campaign to represent Illinois in the United States Senate with his victory in the March Democratic Party primary, his keynote address at the Democratic National Convention in July, and his election to the Senate in November. He began his presidential campaign in 2007 and, after a close primary campaign against Hillary Clinton in 2008, he won sufficient delegates in the Democratic Party primaries to receive the presidential nomination. He then defeated Republican nominee John McCain in the general election, and was inaugurated as president on January 20, 2009. Nine months after his inauguration, Obama was named the 2009 Nobel Peace Prize laureate. 
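
For reference, here is a minimal sketch of how the helpers in
`language/syntax_triples/main.py` above fit together, assuming
`GOOGLE_APPLICATION_CREDENTIALS` is exported and the script is run from its own
directory so it can be imported as `main`:

```
import main

# Mirrors the sentence used in test_find_triples above.
text = 'President Obama won the noble prize'

analysis = main.analyze_syntax(text)
tokens = analysis.get('tokens', [])

# find_triples yields (subject, verb, object) token indices;
# phrase_text_for_head expands an index into the full phrase it heads.
for nsubj, verb, dobj in main.find_triples(tokens):
    print(main.phrase_text_for_head(tokens, text, nsubj),
          tokens[verb]['text']['content'],
          main.phrase_text_for_head(tokens, text, dobj))
```

With that test sentence, this should recover the subject `President Obama`, the
verb `won`, and the object phrase `the noble prize`.
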
From aff93ad794f89e1bf14b095a5ac372271ae2c24c Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Tue, 19 Jul 2016 18:48:17 -0700 Subject: [PATCH 02/10] fixed variable name error --- language/movie_nl/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 380f495f967..9281e1d6bee 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -164,8 +164,8 @@ def get_sentiment_entities(service, document): positive_sentiments = [ polarity for polarity, magnitude in sentiments if polarity > 0.0] - negative = sum(negative_sentiment) - positive = sum(positive_sentiment) + negative = sum(negative_sentiments) + positive = sum(positive_sentiments) total = positive + negative return (total, entities) From 03cb67be40be347998e5b199ebbe6fc1f4886d7c Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Tue, 19 Jul 2016 19:06:34 -0700 Subject: [PATCH 03/10] logged error message with exception Change-Id: I8ff59d08f2ae8ce4cb3dd0b57b3548db3d5b8add --- language/movie_nl/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 9281e1d6bee..066818f4de6 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -208,8 +208,8 @@ def process_movie_reviews(service, reader, sentiment_writer, entity_writer): collected_entities[ent] = (ent_sent, frequency) - except Exception: - logging.info('Skipping {}'.format(document.doc_id)) + except Exception as e: + logging.exception('Skipping {}'.format(document.doc_id)) for entity, e_tuple in collected_entities.items(): entity_writer.write(to_entity_json(entity, e_tuple)) From e2cc3d548932877d7469ad3a6f0c16eb9e4344a8 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Tue, 19 Jul 2016 19:10:11 -0700 Subject: [PATCH 04/10] fixed variable unused error Change-Id: I1367429c3ceaefd0b0d36d9d7cad6ca29cbfdd2d --- language/movie_nl/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 066818f4de6..d1142b4f1ea 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -208,7 +208,7 @@ def process_movie_reviews(service, reader, sentiment_writer, entity_writer): collected_entities[ent] = (ent_sent, frequency) - except Exception as e: + except Exception: logging.exception('Skipping {}'.format(document.doc_id)) for entity, e_tuple in collected_entities.items(): From 17896774eaf49d2ce926dd2a3f9b16c01f215c74 Mon Sep 17 00:00:00 2001 From: Jerjou Cheng Date: Tue, 19 Jul 2016 19:20:05 -0700 Subject: [PATCH 05/10] Refactor for clarity. Change-Id: Iecc1327db4aa21eb5ee61ce58bba3d142e600734 --- language/movie_nl/README.md | 6 +- language/movie_nl/main.py | 127 ++++++++++++++++-------------------- 2 files changed, 58 insertions(+), 75 deletions(-) diff --git a/language/movie_nl/README.md b/language/movie_nl/README.md index b651dee8bb7..dab79d20c6d 100644 --- a/language/movie_nl/README.md +++ b/language/movie_nl/README.md @@ -110,7 +110,7 @@ $ pip install -r requirements.txt ### How to Run ``` -$ python main.py --inp "tokens/*/*" \ +$ python main.py analyze --inp "tokens/*/*" \ --sout sentiment.json \ --eout entity.json \ --sample 5 @@ -145,9 +145,7 @@ In order to sort and rank the entities generated, use the same `main.py` script. 
this will print the top 5 actors with negative sentiment: ``` -$ python main.py --inp entity.json \ - --sout sentiment.json \ - --eout entity.json \ +$ python main.py rank entity.json \ --sentiment neg \ --reverse True \ --sample 5 diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index d1142b4f1ea..9d6380738c4 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -186,30 +186,26 @@ def process_movie_reviews(service, reader, sentiment_writer, entity_writer): collected_entities = {} for document in reader: - try: - sentiment_total, entities = get_sentiment_entities( - service, document) - document.label = get_sentiment_label(sentiment_total) - - sentiment_writer.write( - to_sentiment_json( - document.doc_id, - sentiment_total, - document.label - ) + sentiment_total, entities = get_sentiment_entities( + service, document) + document.label = get_sentiment_label(sentiment_total) + + sentiment_writer.write( + to_sentiment_json( + document.doc_id, + sentiment_total, + document.label ) + ) - sentiment_writer.write('\n') + sentiment_writer.write('\n') - for ent in entities: - ent_sent, frequency = collected_entities.get(ent, (0, 0)) - ent_sent += sentiment_total - frequency += 1 + for ent in entities: + ent_sent, frequency = collected_entities.get(ent, (0, 0)) + ent_sent += sentiment_total + frequency += 1 - collected_entities[ent] = (ent_sent, frequency) - - except Exception: - logging.exception('Skipping {}'.format(document.doc_id)) + collected_entities[ent] = (ent_sent, frequency) for entity, e_tuple in collected_entities.items(): entity_writer.write(to_entity_json(entity, e_tuple)) @@ -231,7 +227,7 @@ def document_generator(dir_path_pattern, count=None): try: text = f.read() except UnicodeDecodeError: - text = None + continue yield Document(text, doc_id, item) @@ -277,70 +273,59 @@ def get_service(): return discovery.build('language', 'v1beta1', http=http) -def main(input_dir, sent_out, ent_out, sample, log_file, - operation, sentiment, ent_in, reverse_bool): +def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): """Movie demo main program""" - sample = int(sample) if sample else None - - if operation == 'rank': - with open(ent_in) as reader: - rank_entities(reader, sentiment, sample, reverse_bool) - else: - # Create logger settings - logging.basicConfig(filename=log_file, level=logging.DEBUG) - - # Create a Google Service object - service = get_service() - - # Create a sentiment output writer - sentiment_writer = open(sent_out, 'w') + # Create logger settings + logging.basicConfig(filename=log_file, level=logging.DEBUG) - # Create an entity output writer - entity_writer = open(ent_out, 'w') + # Create a Google Service object + service = get_service() - reader = document_generator(input_dir, sample) + reader = document_generator(input_dir, sample) - # Process the movie documents - process_movie_reviews(service, reader, sentiment_writer, entity_writer) + # Process the movie documents + process_movie_reviews(service, reader, sentiment_writer, entity_writer) - # close reader and writers - sentiment_writer.close() - entity_writer.close() - reader.close() + # close reader and writers + sentiment_writer.close() + entity_writer.close() + reader.close() if __name__ == '__main__': parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('--inp', help='location of the input', required=True) - parser.add_argument( - '--sout', help='location of the sentiment output', 
required=True) - parser.add_argument( - '--eout', help='location of the entity output', required=True) - parser.add_argument('--sample', help='number of top items to process') - parser.add_argument( - '--op', - help='operation to perform "rank" or "analyze"', - default='analyze') - parser.add_argument( + + subparsers = parser.add_subparsers(dest='command') + + rank_parser = subparsers.add_parser('rank') + + rank_parser.add_argument( + 'entity_input', help='location of entity input', type=argparse.FileType('r')) + rank_parser.add_argument( '--sentiment', help='filter sentiment as "neg" or "pos"') - parser.add_argument( - '--ein', help='location of entity input') - parser.add_argument( + rank_parser.add_argument( '--reverse', help='reverse the order of the items') + rank_parser.add_argument('--sample', help='number of top items to process') + + analyze_parser = subparsers.add_parser('analyze') + + analyze_parser.add_argument( + '--inp', help='location of the input', required=True) + analyze_parser.add_argument( + '--sout', help='location of the sentiment output', required=True, + type=argparse.FileType('w')) + analyze_parser.add_argument( + '--eout', help='location of the entity output', required=True, + type=argparse.FileType('w')) + analyze_parser.add_argument('--sample', help='number of top items to process') + analyze_parser.add_argument('--log_file', default='movie.log') args = parser.parse_args() - log_file = 'movie.log' - - main(args.inp, - args.sout, - args.eout, - args.sample, - log_file, - args.op, - args.sentiment, - args.ein, - args.reverse) + if args.command == 'analyze': + analyze(args.inp, args.sout, args.eout, args.sample, args.log_file) + elif args.command == 'rank': + rank_entities(args.entity_input, args.sentiment, args.sample, args.reverse) From 1242524956ffbcede1e824c3b7e2ef2971f0d223 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Tue, 19 Jul 2016 19:47:11 -0700 Subject: [PATCH 06/10] cast samples to int Change-Id: I33a6c3259dc5a9b0cc1ad081cd57acc452fc6b1a --- language/movie_nl/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 9d6380738c4..bebdd59c3fd 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -326,6 +326,6 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): args = parser.parse_args() if args.command == 'analyze': - analyze(args.inp, args.sout, args.eout, args.sample, args.log_file) + analyze(args.inp, args.sout, args.eout, int(args.sample), args.log_file) elif args.command == 'rank': - rank_entities(args.entity_input, args.sentiment, args.sample, args.reverse) + rank_entities(args.entity_input, args.sentiment, int(args.sample), args.reverse) From 3e4deb12216fd61f7c1acebd4e2d7d167f2ab38d Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Tue, 19 Jul 2016 19:52:20 -0700 Subject: [PATCH 07/10] added sample variable Change-Id: Ic2b721f775601d2dfb718be26d425e7c2e91eb8a --- language/movie_nl/main.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index bebdd59c3fd..36f36109e27 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -325,7 +325,12 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): args = parser.parse_args() + sample = args.sample + + if args.sample is not None: + sample = int(args.sample) + if args.command == 'analyze': - analyze(args.inp, args.sout, args.eout, int(args.sample), 
args.log_file) + analyze(args.inp, args.sout, args.eout, sample, args.log_file) elif args.command == 'rank': - rank_entities(args.entity_input, args.sentiment, int(args.sample), args.reverse) + rank_entities(args.entity_input, args.sentiment, sample, args.reverse) From 4ffc546478f78db358c3981781a9551931946db5 Mon Sep 17 00:00:00 2001 From: Puneith Kaul Date: Tue, 19 Jul 2016 20:09:12 -0700 Subject: [PATCH 08/10] fixed indentation bug Change-Id: I2eac2d41e3f8d96c1d049dbbc7b48c96d5fded0a --- language/movie_nl/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 36f36109e27..9238be17569 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -105,15 +105,15 @@ def extract_all_sentences(self, service): sentences = response.get('sentences', []) sent_list = [ - sentence.get('text').get('content')for sentence in sentences + sentence.get('text').get('content') for sentence in sentences ] - for entity in entities: - ent_type = entity.get('type') - wiki_url = entity.get('metadata', {}).get('wikipedia_url') + for entity in entities: + ent_type = entity.get('type') + wiki_url = entity.get('metadata', {}).get('wikipedia_url') - if ent_type == 'PERSON' and wiki_url is not None: - ent_list.append(wiki_url) + if ent_type == 'PERSON' and wiki_url is not None: + ent_list.append(wiki_url) self.sentent_pair = (sent_list, ent_list) From 7ca9918f84f8e71934b24bfe9385563ee8266f17 Mon Sep 17 00:00:00 2001 From: Jerjou Cheng Date: Tue, 19 Jul 2016 20:07:40 -0700 Subject: [PATCH 09/10] Fix lint errors Change-Id: I026963be93ca3ef776420807e14abdcfcc5fe95f --- language/movie_nl/main.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py index 9238be17569..b7b342af9d5 100644 --- a/language/movie_nl/main.py +++ b/language/movie_nl/main.py @@ -303,12 +303,14 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): rank_parser = subparsers.add_parser('rank') rank_parser.add_argument( - 'entity_input', help='location of entity input', type=argparse.FileType('r')) + 'entity_input', help='location of entity input', + type=argparse.FileType('r')) rank_parser.add_argument( '--sentiment', help='filter sentiment as "neg" or "pos"') rank_parser.add_argument( '--reverse', help='reverse the order of the items') - rank_parser.add_argument('--sample', help='number of top items to process') + rank_parser.add_argument( + '--sample', help='number of top items to process', type=int) analyze_parser = subparsers.add_parser('analyze') @@ -320,17 +322,14 @@ def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): analyze_parser.add_argument( '--eout', help='location of the entity output', required=True, type=argparse.FileType('w')) - analyze_parser.add_argument('--sample', help='number of top items to process') + analyze_parser.add_argument( + '--sample', help='number of top items to process', type=int) analyze_parser.add_argument('--log_file', default='movie.log') args = parser.parse_args() - sample = args.sample - - if args.sample is not None: - sample = int(args.sample) - if args.command == 'analyze': - analyze(args.inp, args.sout, args.eout, sample, args.log_file) + analyze(args.inp, args.sout, args.eout, args.sample, args.log_file) elif args.command == 'rank': - rank_entities(args.entity_input, args.sentiment, sample, args.reverse) + rank_entities( + args.entity_input, args.sentiment, args.sample, 
args.reverse) From 0be30eb388c4f21f667ec5d6e2e3fb5652c433f8 Mon Sep 17 00:00:00 2001 From: Jerjou Cheng Date: Tue, 19 Jul 2016 20:13:53 -0700 Subject: [PATCH 10/10] Remove movie_nl sample until it's more stable. Change-Id: Iab58378f40c760e88f2b3cc0806985894ce77e76 --- language/README.md | 3 - language/movie_nl/README.md | 152 ------------- language/movie_nl/main.py | 335 ----------------------------- language/movie_nl/main_test.py | 82 ------- language/movie_nl/requirements.txt | 3 - 5 files changed, 575 deletions(-) delete mode 100644 language/movie_nl/README.md delete mode 100644 language/movie_nl/main.py delete mode 100644 language/movie_nl/main_test.py delete mode 100644 language/movie_nl/requirements.txt diff --git a/language/README.md b/language/README.md index e63d45eb9a6..130ce66ff83 100644 --- a/language/README.md +++ b/language/README.md @@ -5,9 +5,6 @@ This directory contains Python examples that use the - [api](api) has a simple command line tool that shows off the API's features. -- [movie_nl](movie_nl) combines sentiment and entity analysis to come up with -actors/directors who are the most and least popular in the imdb movie reviews. - - [ocr_nl](ocr_nl) uses the [Cloud Vision API](https://cloud.google.com/vision/) to extract text from images, then uses the NL API to extract entity information from those texts, and stores the extracted information in a database in support diff --git a/language/movie_nl/README.md b/language/movie_nl/README.md deleted file mode 100644 index dab79d20c6d..00000000000 --- a/language/movie_nl/README.md +++ /dev/null @@ -1,152 +0,0 @@ -# Introduction -This sample is an application of the Google Cloud Platform Natural Language API. -It uses the [imdb movie reviews data set](https://www.cs.cornell.edu/people/pabo/movie-review-data/) -from [Cornell University](http://www.cs.cornell.edu/) and performs sentiment & entity -analysis on it. It combines the capabilities of sentiment analysis and entity recognition -to come up with actors/directors who are the most and least popular. - -### Set Up to Authenticate With Your Project's Credentials - -Please follow the [Set Up Your Project](https://cloud.google.com/natural-language/docs/getting-started#set_up_your_project) -steps in the Quickstart doc to create a project and enable the -Cloud Natural Language API. Following those steps, make sure that you -[Set Up a Service Account](https://cloud.google.com/natural-language/docs/common/auth#set_up_a_service_account), -and export the following environment variable: - -``` -export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json -``` - -**Note:** If you get an error saying your API hasn't been enabled, make sure -that you have correctly set this environment variable, and that the project that -you got the service account from has the Natural Language API enabled. - -## How it works -This sample uses the Natural Language API to annotate the input text. The -movie review document is broken into sentences using the `extract_syntax` feature. -Each sentence is sent to the API for sentiment analysis. The positive and negative -sentiment values are combined to come up with a single overall sentiment of the -movie document. - -In addition to the sentiment, the program also extracts the entities of type -`PERSON`, who are the actors in the movie (including the director and anyone -important). These entities are assigned the sentiment value of the document to -come up with the most and least popular actors/directors. 
- -### Movie document -We define a movie document as a set of reviews. These reviews are individual -sentences and we use the NL API to extract the sentences from the document. See -an example movie document below. - -``` - Sample review sentence 1. Sample review sentence 2. Sample review sentence 3. -``` - -### Sentences and Sentiment -Each sentence from the above document is assigned a sentiment as below. - -``` - Sample review sentence 1 => Sentiment 1 - Sample review sentence 2 => Sentiment 2 - Sample review sentence 3 => Sentiment 3 -``` - -### Sentiment computation -The final sentiment is computed by simply adding the sentence sentiments. - -``` - Total Sentiment = Sentiment 1 + Sentiment 2 + Sentiment 3 -``` - - -### Entity extraction and Sentiment assignment -Entities with type `PERSON` are extracted from the movie document using the NL -API. Since these entities are mentioned in their respective movie document, -they are associated with the document sentiment. - -``` - Document 1 => Sentiment 1 - - Person 1 - Person 2 - Person 3 - - Document 2 => Sentiment 2 - - Person 2 - Person 4 - Person 5 -``` - -Based on the above data we can calculate the sentiment associated with Person 2: - -``` - Person 2 => (Sentiment 1 + Sentiment 2) -``` - -## Movie Data Set -We have used the Cornell Movie Review data as our input. Please follow the instructions below to download and extract the data. - -### Download Instructions - -``` - $ curl -O http://www.cs.cornell.edu/people/pabo/movie-review-data/mix20_rand700_tokens.zip - $ unzip mix20_rand700_tokens.zip -``` - -## Command Line Usage -In order to use the movie analyzer, follow the instructions below. (Note that the `--sample` parameter below runs the script on -fewer documents, and can be omitted to run it on the entire corpus) - -### Install Dependencies - -Install [pip](https://pip.pypa.io/en/stable/installing) if not already installed. - -Then, install dependencies by running the following pip command: - -``` -$ pip install -r requirements.txt -``` -### How to Run - -``` -$ python main.py analyze --inp "tokens/*/*" \ - --sout sentiment.json \ - --eout entity.json \ - --sample 5 -``` - -You should see the log file `movie.log` created. - -## Output Data -The program produces sentiment and entity output in json format. For example: - -### Sentiment Output -``` - { - "doc_id": "cv310_tok-16557.txt", - "sentiment": 3.099, - "label": -1 - } -``` - -### Entity Output - -``` - { - "name": "Sean Patrick Flanery", - "wiki_url": "http://en.wikipedia.org/wiki/Sean_Patrick_Flanery", - "sentiment": 3.099 - } -``` - -### Entity Output Sorting -In order to sort and rank the entities generated, use the same `main.py` script. For example, -this will print the top 5 actors with negative sentiment: - -``` -$ python main.py rank entity.json \ - --sentiment neg \ - --reverse True \ - --sample 5 -``` diff --git a/language/movie_nl/main.py b/language/movie_nl/main.py deleted file mode 100644 index b7b342af9d5..00000000000 --- a/language/movie_nl/main.py +++ /dev/null @@ -1,335 +0,0 @@ -# Copyright 2016 Google, Inc -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import codecs -import glob -import json -import logging -import os - -from googleapiclient import discovery -import httplib2 -from oauth2client.client import GoogleCredentials -import requests - - -def analyze_document(service, document): - """Analyze the document and get the distribution of sentiments and - the movie name.""" - logging.info('Analyzing {}'.format(document.doc_id)) - - sentences, entities = document.extract_all_sentences(service) - sentiments = [get_sentiment(service, sentence) for sentence in sentences] - - return sentiments, entities - - -def get_request_body(text, syntax=True, entities=True, sentiment=True): - """Creates the body of the request to the language api in - order to get an appropriate api response.""" - body = { - 'document': { - 'type': 'PLAIN_TEXT', - 'content': text, - }, - 'features': { - 'extract_syntax': syntax, - 'extract_entities': entities, - 'extract_document_sentiment': sentiment, - }, - 'encoding_type': 'UTF32' - } - - return body - - -def get_sentiment(service, sentence): - """Get the sentence-level sentiment.""" - body = get_request_body( - sentence, syntax=False, entities=True, sentiment=True) - - docs = service.documents() - request = docs.annotateText(body=body) - response = request.execute() - sentiment = response.get("documentSentiment") - - if sentiment is None: - return (None, None) - else: - pol = sentiment.get("polarity") - mag = sentiment.get("magnitude") - - if pol is None and mag is not None: - pol = 0 - return (pol, mag) - - -class Document(object): - """Document class captures a single document of movie reviews.""" - - def __init__(self, text, doc_id, doc_path): - self.text = text - self.doc_id = doc_id - self.doc_path = doc_path - self.sentent_pair = None - self.label = None - - def extract_all_sentences(self, service): - """Extract the sentences in a document.""" - - if self.sentent_pair is None: - docs = service.documents() - request_body = get_request_body( - self.text, - syntax=True, - entities=True, - sentiment=False) - request = docs.annotateText(body=request_body) - - ent_list = [] - - response = request.execute() - entities = response.get('entities', []) - sentences = response.get('sentences', []) - - sent_list = [ - sentence.get('text').get('content') for sentence in sentences - ] - - for entity in entities: - ent_type = entity.get('type') - wiki_url = entity.get('metadata', {}).get('wikipedia_url') - - if ent_type == 'PERSON' and wiki_url is not None: - ent_list.append(wiki_url) - - self.sentent_pair = (sent_list, ent_list) - - return self.sentent_pair - - -def to_sentiment_json(doc_id, sent, label): - """Convert the sentiment info to json.""" - json_doc = {} - - json_doc['doc_id'] = doc_id - json_doc['sentiment'] = float('%.3f' % sent) - json_doc['label'] = label - - return json.dumps(json_doc) - - -def get_wiki_title(wiki_url): - """Get the wikipedia page title for a given wikipedia URL.""" - try: - content = requests.get(wiki_url).text - return content.split('title')[1].split('-')[0].split('>')[1].strip() - except: - return os.path.basename(wiki_url).replace('_', ' ') - - -def to_entity_json(entity, e_tuple): - """Convert the entity info to json.""" - json_doc = {} - - avg_sentiment = float(e_tuple[0]) / float(e_tuple[1]) - - json_doc['wiki_url'] = entity - json_doc['name'] = get_wiki_title(entity) - json_doc['sentiment'] = float('%.3f' % e_tuple[0]) - json_doc['avg_sentiment'] = float('%.3f' % 
avg_sentiment) - - return json.dumps(json_doc) - - -def get_sentiment_entities(service, document): - """Compute the overall sentiment volume in the document""" - sentiments, entities = analyze_document(service, document) - - sentiments = [sent for sent in sentiments if sent[0] is not None] - negative_sentiments = [ - polarity for polarity, magnitude in sentiments if polarity < 0.0] - positive_sentiments = [ - polarity for polarity, magnitude in sentiments if polarity > 0.0] - - negative = sum(negative_sentiments) - positive = sum(positive_sentiments) - total = positive + negative - - return (total, entities) - - -def get_sentiment_label(sentiment): - """Return the sentiment label based on the sentiment quantity.""" - if sentiment < 0: - return -1 - elif sentiment > 0: - return 1 - else: - return 0 - - -def process_movie_reviews(service, reader, sentiment_writer, entity_writer): - """Perform some sentiment math and come up with movie review.""" - collected_entities = {} - - for document in reader: - sentiment_total, entities = get_sentiment_entities( - service, document) - document.label = get_sentiment_label(sentiment_total) - - sentiment_writer.write( - to_sentiment_json( - document.doc_id, - sentiment_total, - document.label - ) - ) - - sentiment_writer.write('\n') - - for ent in entities: - ent_sent, frequency = collected_entities.get(ent, (0, 0)) - ent_sent += sentiment_total - frequency += 1 - - collected_entities[ent] = (ent_sent, frequency) - - for entity, e_tuple in collected_entities.items(): - entity_writer.write(to_entity_json(entity, e_tuple)) - entity_writer.write('\n') - - sentiment_writer.flush() - entity_writer.flush() - - -def document_generator(dir_path_pattern, count=None): - """Generator for the input movie documents.""" - for running_count, item in enumerate(glob.iglob(dir_path_pattern)): - if count and running_count >= count: - raise StopIteration() - - doc_id = os.path.basename(item) - - with codecs.open(item, encoding='utf-8') as f: - try: - text = f.read() - except UnicodeDecodeError: - continue - - yield Document(text, doc_id, item) - - -def rank_entities(reader, sentiment=None, topn=None, reverse_bool=False): - """Rank the entities (actors) based on their sentiment - assigned from the movie.""" - - items = [] - for item in reader: - json_item = json.loads(item) - sent = json_item.get('sentiment') - entity_item = (sent, json_item) - - if sentiment: - if sentiment == 'pos' and sent > 0: - items.append(entity_item) - elif sentiment == 'neg' and sent < 0: - items.append(entity_item) - else: - items.append(entity_item) - - items.sort(reverse=True) - items = [json.dumps(item[1]) for item in items] - - if reverse_bool: - items.reverse() - - if topn: - print('\n'.join(items[:topn])) - else: - print('\n'.join(items)) - - -def get_service(): - """Build a client to the Google Cloud Natural Language API.""" - - credentials = GoogleCredentials.get_application_default() - scoped_credentials = credentials.create_scoped( - ['https://www.googleapis.com/auth/cloud-platform']) - http = httplib2.Http() - scoped_credentials.authorize(http) - return discovery.build('language', 'v1beta1', http=http) - - -def analyze(input_dir, sentiment_writer, entity_writer, sample, log_file): - """Movie demo main program""" - - # Create logger settings - logging.basicConfig(filename=log_file, level=logging.DEBUG) - - # Create a Google Service object - service = get_service() - - reader = document_generator(input_dir, sample) - - # Process the movie documents - process_movie_reviews(service, reader, 
sentiment_writer, entity_writer) - - # close reader and writers - sentiment_writer.close() - entity_writer.close() - reader.close() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - - subparsers = parser.add_subparsers(dest='command') - - rank_parser = subparsers.add_parser('rank') - - rank_parser.add_argument( - 'entity_input', help='location of entity input', - type=argparse.FileType('r')) - rank_parser.add_argument( - '--sentiment', help='filter sentiment as "neg" or "pos"') - rank_parser.add_argument( - '--reverse', help='reverse the order of the items') - rank_parser.add_argument( - '--sample', help='number of top items to process', type=int) - - analyze_parser = subparsers.add_parser('analyze') - - analyze_parser.add_argument( - '--inp', help='location of the input', required=True) - analyze_parser.add_argument( - '--sout', help='location of the sentiment output', required=True, - type=argparse.FileType('w')) - analyze_parser.add_argument( - '--eout', help='location of the entity output', required=True, - type=argparse.FileType('w')) - analyze_parser.add_argument( - '--sample', help='number of top items to process', type=int) - analyze_parser.add_argument('--log_file', default='movie.log') - - args = parser.parse_args() - - if args.command == 'analyze': - analyze(args.inp, args.sout, args.eout, args.sample, args.log_file) - elif args.command == 'rank': - rank_entities( - args.entity_input, args.sentiment, args.sample, args.reverse) diff --git a/language/movie_nl/main_test.py b/language/movie_nl/main_test.py deleted file mode 100644 index 96907908018..00000000000 --- a/language/movie_nl/main_test.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2016 Google, Inc -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import io -import json - -import main - - -def test_get_request_body(): - text = 'hello world' - body = main.get_request_body(text, syntax=True, entities=True, - sentiment=False) - assert body.get('document').get('content') == text - - assert body.get('features').get('extract_syntax') is True - assert body.get('features').get('extract_entities') is True - assert body.get('features').get('extract_document_sentiment') is False - - -def test_get_sentiment_label(): - assert main.get_sentiment_label(20.50) == 1 - assert main.get_sentiment_label(-42.34) == -1 - - -def test_to_sentiment_json(): - doc_id = '12345' - sentiment = 23.344564 - label = 1 - - sentiment_json = json.loads( - main.to_sentiment_json(doc_id, sentiment, label) - ) - - assert sentiment_json.get('doc_id') == doc_id - assert sentiment_json.get('sentiment') == 23.345 - assert sentiment_json.get('label') == label - - -def test_process_movie_reviews(): - service = main.get_service() - - doc1 = main.Document('Top Gun was awesome and Tom Cruise rocked!', 'doc1', - 'doc1') - doc2 = main.Document('Tom Cruise is a great actor.', 'doc2', 'doc2') - - reader = [doc1, doc2] - swriter = io.StringIO() - ewriter = io.StringIO() - - main.process_movie_reviews(service, reader, swriter, ewriter) - - sentiments = swriter.getvalue().strip().split('\n') - entities = ewriter.getvalue().strip().split('\n') - - sentiments = [json.loads(sentiment) for sentiment in sentiments] - entities = [json.loads(entity) for entity in entities] - - # assert sentiments - assert sentiments[0].get('sentiment') == 1.0 - assert sentiments[0].get('label') == 1 - - assert sentiments[1].get('sentiment') == 1.0 - assert sentiments[1].get('label') == 1 - - # assert entities - assert len(entities) == 1 - assert entities[0].get('name') == 'Tom Cruise' - assert (entities[0].get('wiki_url') == - 'http://en.wikipedia.org/wiki/Tom_Cruise') - assert entities[0].get('sentiment') == 2.0 diff --git a/language/movie_nl/requirements.txt b/language/movie_nl/requirements.txt deleted file mode 100644 index 391be2e9843..00000000000 --- a/language/movie_nl/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -urlparse2==1.1.1 -google-api-python-client==1.5.1 -requests==2.10.0